diff --git a/Demo/Demo.csproj b/Demo/Demo.csproj index 5efb0a5..4d71245 100644 --- a/Demo/Demo.csproj +++ b/Demo/Demo.csproj @@ -8,6 +8,7 @@ + diff --git a/Demo/Program.cs b/Demo/Program.cs index a46fb18..85fa730 100644 --- a/Demo/Program.cs +++ b/Demo/Program.cs @@ -1,5 +1,7 @@ -using mupdf; +using mupdf; using MuPDF.NET; +using MuPDF.NET4LLM; +using MuPDF.NET4LLM.Helpers; using SkiaSharp; using System; using System.Collections.Generic; @@ -18,6 +20,7 @@ using Font = MuPDF.NET.Font; using Morph = MuPDF.NET.Morph; using TextWriter = MuPDF.NET.TextWriter; +using Utils = MuPDF.NET.Utils; namespace Demo { @@ -39,46 +42,433 @@ class Program { static void Main(string[] args) { - TestInsertHtmlbox(); - TestLineAnnot(); - AnnotationsFreeText1.Run(args); - AnnotationsFreeText2.Run(args); - NewAnnots.Run(args); - TestHelloWorldToNewDocument(args); - TestHelloWorldToExistingDocument(args); - TestReadBarcode(args); - TestReadDataMatrix(); - TestWriteBarcode(args); - TestExtractTextWithLayout(args); - TestWidget(args); - TestColor(args); - TestCMYKRecolor(args); - TestSVGRecolor(args); - TestReplaceImage(args); - TestInsertImage(args); - TestGetImageInfo(args); - TestGetTextPageOcr(args); - TestCreateImagePage(args); - TestJoinPdfPages(args); - TestFreeTextAnnot(args); - TestTextFont(args); - TestMemoryLeak(); - TestDrawLine(); - TestWriteBarcode1(); - TestUnicodeDocument(); - TestMorph(); - TestMetadata(); - TestMoveFile(); - TestImageFilter(); - TestImageFilterOcr(); - CreateAnnotDocument(); - TestDrawShape(); - TestIssue213(); - TestIssue1880(); + //TestInsertHtmlbox(); + //TestLineAnnot(); + //AnnotationsFreeText1.Run(args); + //AnnotationsFreeText2.Run(args); + //NewAnnots.Run(args); + //TestHelloWorldToNewDocument(args); + //TestHelloWorldToExistingDocument(args); + //TestReadBarcode(args); + //TestReadDataMatrix(); + //TestWriteBarcode(args); + //TestExtractTextWithLayout(args); + //TestWidget(args); + //TestColor(args); + //TestCMYKRecolor(args); + //TestSVGRecolor(args); + //TestReplaceImage(args); + //TestInsertImage(args); + //TestGetImageInfo(args); + //TestGetTextPageOcr(args); + //TestCreateImagePage(args); + //TestJoinPdfPages(args); + //TestFreeTextAnnot(args); + //TestTextFont(args); + //TestMemoryLeak(); + //TestDrawLine(); + //TestWriteBarcode1(); + //TestUnicodeDocument(); + //TestMorph(); + //TestMetadata(); + //TestMoveFile(); + //TestImageFilter(); + //TestImageFilterOcr(); + //CreateAnnotDocument(); + //TestDrawShape(); + //TestIssue213(); + //TestIssue1880(); + //TestLLM(); + TestPyMuPdfRagToMarkdown(); // Uncomment to test PyMuPdfRag.ToMarkdown() directly + //TestTable(); return; } + static void TestTable() + { + Console.WriteLine("\n=== TestTable ======================="); + + try + { + string testFilePath = Path.GetFullPath("../../../TestDocuments/national-capitals.pdf"); + + if (!File.Exists(testFilePath)) + { + Console.WriteLine($"Error: Test file not found: {testFilePath}"); + return; + } + + Console.WriteLine($"Loading PDF: {testFilePath}"); + Document doc = new Document(testFilePath); + Console.WriteLine($"Document loaded: {doc.PageCount} page(s)"); + + // Test on first page + Page page = doc[0]; + Console.WriteLine($"\nPage 0 - Rect: {page.Rect}"); + + // Test 1: Get tables with default strategy + Console.WriteLine("\n--- Test 1: Get tables with 'lines_strict' strategy ---"); + List tables = Utils.GetTables( + page, + clip: page.Rect, + vertical_strategy: "lines_strict", + horizontal_strategy: "lines_strict"); + + Console.WriteLine($"Found {tables.Count} table(s) on 
page 0");
+
+ if (tables.Count > 0)
+ {
+ for (int i = 0; i < tables.Count; i++)
+ {
+ Table table = tables[i];
+ Console.WriteLine($"\n Table {i + 1}:");
+ Console.WriteLine($" Rows: {table.row_count}");
+ Console.WriteLine($" Columns: {table.col_count}");
+ if (table.bbox != null)
+ {
+ Console.WriteLine($" BBox: ({table.bbox.X0:F2}, {table.bbox.Y0:F2}, {table.bbox.X1:F2}, {table.bbox.Y1:F2})");
+ }
+
+ // Display header information
+ if (table.header != null)
+ {
+ Console.WriteLine($" Header:");
+ Console.WriteLine($" External: {table.header.external}");
+ if (table.header.names != null && table.header.names.Count > 0)
+ {
+ Console.WriteLine($" Column names: {string.Join(", ", table.header.names)}");
+ }
+ }
+
+ // Extract table data
+ Console.WriteLine($"\n Extracting table data...");
+ List<List<string>> tableData = table.Extract();
+ if (tableData != null && tableData.Count > 0)
+ {
+ Console.WriteLine($" Extracted {tableData.Count} row(s) of data");
+ // Show first few rows as preview
+ int previewRows = Math.Min(3, tableData.Count);
+ for (int row = 0; row < previewRows; row++)
+ {
+ var rowData = tableData[row];
+ if (rowData != null)
+ {
+ Console.WriteLine($" Row {row + 1}: {string.Join(" | ", rowData.Take(5))}"); // Show first 5 columns
+ }
+ }
+ if (tableData.Count > previewRows)
+ {
+ Console.WriteLine($" ... and {tableData.Count - previewRows} more row(s)");
+ }
+ }
+
+ // Convert to markdown
+ Console.WriteLine($"\n Converting to Markdown...");
+ try
+ {
+ string markdown = table.ToMarkdown(clean: false, fillEmpty: true);
+ if (!string.IsNullOrEmpty(markdown))
+ {
+ Console.WriteLine($" Markdown length: {markdown.Length} characters");
+ // Save markdown to file
+ string markdownFile = $"table_{i + 1}_page0.md";
+ File.WriteAllText(markdownFile, markdown, Encoding.UTF8);
+ Console.WriteLine($" Markdown saved to: {markdownFile}");
+
+ // Show preview
+ int previewLength = Math.Min(200, markdown.Length);
+ Console.WriteLine($" Preview (first {previewLength} chars):");
+ Console.WriteLine($" {markdown.Substring(0, previewLength)}...");
+ }
+ }
+ catch (Exception ex)
+ {
+ Console.WriteLine($" Error converting to markdown: {ex.Message}");
+ }
+ }
+ }
+ else
+ {
+ Console.WriteLine("No tables found. Trying with 'lines' strategy...");
+
+ // Test 2: Try with 'lines' strategy (less strict)
+ Console.WriteLine("\n--- Test 2: Get tables with 'lines' strategy ---");
+ tables = Utils.GetTables(
+ page,
+ clip: page.Rect,
+ vertical_strategy: "lines",
+ horizontal_strategy: "lines");
+
+ Console.WriteLine($"Found {tables.Count} table(s) with 'lines' strategy");
+ }
+
+ // Test 3: Try with 'text' strategy
+ Console.WriteLine("\n--- Test 3: Get tables with 'text' strategy ---");
+ List<Table> textTables = Utils.GetTables(
+ page,
+ clip: page.Rect,
+ vertical_strategy: "text",
+ horizontal_strategy: "text");
+
+ Console.WriteLine($"Found {textTables.Count} table(s) with 'text' strategy");
+
+ // Test 4: Get tables from all pages
+ Console.WriteLine("\n--- Test 4: Get tables from all pages ---");
+ int totalTables = 0;
+ for (int pageNum = 0; pageNum < doc.PageCount; pageNum++)
+ {
+ Page currentPage = doc[pageNum];
+ List<Table>
pageTables = Utils.GetTables( + currentPage, + clip: currentPage.Rect, + vertical_strategy: "lines_strict", + horizontal_strategy: "lines_strict"); + + if (pageTables.Count > 0) + { + Console.WriteLine($" Page {pageNum}: {pageTables.Count} table(s)"); + totalTables += pageTables.Count; + } + currentPage.Dispose(); + } + Console.WriteLine($"Total tables found across all pages: {totalTables}"); + + page.Dispose(); + doc.Close(); + + Console.WriteLine("\n=== TestTable completed successfully ==="); + } + catch (Exception ex) + { + Console.WriteLine($"Error in TestTable: {ex.Message}"); + Console.WriteLine($"Stack trace: {ex.StackTrace}"); + throw; + } + } + + static void TestPyMuPdfRagToMarkdown() + { + Console.WriteLine("\n=== TestPyMuPdfRagToMarkdown ======================="); + + try + { + // Find a test PDF file + string testFilePath = Path.GetFullPath("../../../TestDocuments/national-capitals.pdf"); + + Document doc = new Document(testFilePath); + Console.WriteLine($"Document loaded: {doc.PageCount} page(s)"); + Console.WriteLine($"Document name: {doc.Name}"); + + // Test 1: Basic ToMarkdown with default settings + Console.WriteLine("\n--- Test 1: Basic ToMarkdown (default settings) ---"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: null, // All pages + hdrInfo: null, // Auto-detect headers + writeImages: false, + embedImages: false, + ignoreImages: false, + ignoreGraphics: false, + detectBgColor: true, + imagePath: "", + imageFormat: "png", + imageSizeLimit: 0.05f, + filename: testFilePath, + forceText: true, + pageChunks: false, + pageSeparators: false, + margins: null, + dpi: 150, + pageWidth: 612, + pageHeight: null, + tableStrategy: "lines_strict", + graphicsLimit: null, + fontsizeLimit: 3.0f, + ignoreCode: false, + extractWords: false, + showProgress: false, + useGlyphs: false, + ignoreAlpha: false + ); + + string markdownFile = "TestPyMuPdfRag_Output.md"; + File.WriteAllText(markdownFile, markdown, Encoding.UTF8); + Console.WriteLine($"Markdown output saved to: {markdownFile}"); + Console.WriteLine($"Markdown length: {markdown.Length} characters"); + if (markdown.Length > 0) + { + int previewLength = Math.Min(300, markdown.Length); + Console.WriteLine($"Preview (first {previewLength} chars):\n{markdown.Substring(0, previewLength)}..."); + } + } + catch (Exception ex) + { + Console.WriteLine($"Error in basic ToMarkdown: {ex.Message}"); + } + + // Test 2: ToMarkdown with IdentifyHeaders + Console.WriteLine("\n--- Test 2: ToMarkdown with IdentifyHeaders ---"); + try + { + var identifyHeaders = new IdentifyHeaders(doc, pages: null, bodyLimit: 12.0f, maxLevels: 6); + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, // First page only + hdrInfo: identifyHeaders, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: testFilePath, + forceText: true, + showProgress: false + ); + + string markdownFile = "TestPyMuPdfRag_WithHeaders.md"; + File.WriteAllText(markdownFile, markdown, Encoding.UTF8); + Console.WriteLine($"Markdown with headers saved to: {markdownFile}"); + Console.WriteLine($"Markdown length: {markdown.Length} characters"); + } + catch (Exception ex) + { + Console.WriteLine($"Error in ToMarkdown with IdentifyHeaders: {ex.Message}"); + } + + // Test 3: ToMarkdown with TocHeaders + Console.WriteLine("\n--- Test 3: ToMarkdown with TocHeaders ---"); + try + { + var tocHeaders = new TocHeaders(doc); + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, // First page only + hdrInfo: tocHeaders, + 
writeImages: false, + embedImages: false, + ignoreImages: false, + filename: testFilePath, + forceText: true, + showProgress: false + ); + + string markdownFile = "TestPyMuPdfRag_WithToc.md"; + File.WriteAllText(markdownFile, markdown, Encoding.UTF8); + Console.WriteLine($"Markdown with TOC headers saved to: {markdownFile}"); + Console.WriteLine($"Markdown length: {markdown.Length} characters"); + } + catch (Exception ex) + { + Console.WriteLine($"Error in ToMarkdown with TocHeaders: {ex.Message}"); + } + + // Test 4: ToMarkdown with page separators + Console.WriteLine("\n--- Test 4: ToMarkdown with page separators ---"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: null, // All pages + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: testFilePath, + forceText: true, + pageSeparators: true, // Add page separators + showProgress: false + ); + + string markdownFile = "TestPyMuPdfRag_WithSeparators.md"; + File.WriteAllText(markdownFile, markdown, Encoding.UTF8); + Console.WriteLine($"Markdown with page separators saved to: {markdownFile}"); + Console.WriteLine($"Markdown length: {markdown.Length} characters"); + } + catch (Exception ex) + { + Console.WriteLine($"Error in ToMarkdown with page separators: {ex.Message}"); + } + + // Test 5: ToMarkdown with progress bar + Console.WriteLine("\n--- Test 5: ToMarkdown with progress bar ---"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: null, // All pages + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: testFilePath, + forceText: true, + showProgress: true, // Show progress bar + pageSeparators: false + ); + + string markdownFile = "TestPyMuPdfRag_WithProgress.md"; + File.WriteAllText(markdownFile, markdown, Encoding.UTF8); + Console.WriteLine($"\nMarkdown with progress saved to: {markdownFile}"); + Console.WriteLine($"Markdown length: {markdown.Length} characters"); + } + catch (Exception ex) + { + Console.WriteLine($"Error in ToMarkdown with progress: {ex.Message}"); + } + + doc.Close(); + } + catch (Exception ex) + { + Console.WriteLine($"An unexpected error occurred during PyMuPdfRag test: {ex.Message}"); + Console.WriteLine($"Stack trace: {ex.StackTrace}"); + } + + Console.WriteLine("\n=== TestPyMuPdfRagToMarkdown Completed ======================="); + } + + static void TestLLM() + { + Console.WriteLine("\n=== TestLLM ======================="); + + try + { + // Display version information + Console.WriteLine($"MuPDF.NET4LLM Version: {MuPDF4LLM.Version}"); + var versionTuple = MuPDF4LLM.VersionTuple; + Console.WriteLine($"Version Tuple: ({versionTuple.major}, {versionTuple.minor}, {versionTuple.patch})"); + + // Test with a sample PDF file + string testFilePath = Path.GetFullPath("../../../TestDocuments/national-capitals.pdf"); + + // Try to find a PDF with actual content if Blank.pdf doesn't work well + if (!File.Exists(testFilePath)) + { + testFilePath = Path.GetFullPath("../../../TestDocuments/Widget.pdf"); + } + + if (!File.Exists(testFilePath)) + { + Console.WriteLine($"Test PDF file not found. 
Skipping LLM test."); + return; + } + + Console.WriteLine($"\nTesting with PDF: {testFilePath}"); + + Document doc = new Document(testFilePath); + Console.WriteLine($"Document loaded: {doc.PageCount} page(s)"); + + doc.Close(); + Console.WriteLine("\nLLM test completed successfully."); + } + catch (Exception ex) + { + Console.WriteLine($"Error in TestLLM: {ex.Message}"); + Console.WriteLine($"Stack trace: {ex.StackTrace}"); + } + } + static void TestIssue1880() { Console.WriteLine("\n=== TestIssue1880 ======================="); diff --git a/Demo/TestDocuments/Magazine.pdf b/Demo/TestDocuments/Magazine.pdf new file mode 100644 index 0000000..c8e166e Binary files /dev/null and b/Demo/TestDocuments/Magazine.pdf differ diff --git a/Demo/TestDocuments/national-capitals.pdf b/Demo/TestDocuments/national-capitals.pdf new file mode 100644 index 0000000..d2b4721 Binary files /dev/null and b/Demo/TestDocuments/national-capitals.pdf differ diff --git a/Demo/annotations-freetext2.cs b/Demo/annotations-freetext2.cs index fa5493c..fb3a0e3 100644 --- a/Demo/annotations-freetext2.cs +++ b/Demo/annotations-freetext2.cs @@ -25,7 +25,7 @@ public static void Run(string[] args) // the annotation text with HTML and styling syntax string text = $@"

-PyMuPDF འདི་ ཡིག་ཆ་བཀྲམ་སྤེལ་གྱི་དོན་ལུ་ པའི་ཐོན་ཐུམ་སྒྲིལ་དྲག་ཤོས་དང་མགྱོགས་ཤོས་ཅིག་ཨིན། +MuPDF.NET འདི་ ཡིག་ཆ་བཀྲམ་སྤེལ་གྱི་དོན་ལུ་ པའི་ཐོན་ཐུམ་སྒྲིལ་དྲག་ཤོས་དང་མགྱོགས་ཤོས་ཅིག་ཨིན། Here is some bold and italic text, followed by bold-italic. Text-based check boxes: {bullet}.

"; diff --git a/MuPDF.NET.Test/AnnotTest.cs b/MuPDF.NET.Test/AnnotTest.cs index f34a372..9573c38 100644 --- a/MuPDF.NET.Test/AnnotTest.cs +++ b/MuPDF.NET.Test/AnnotTest.cs @@ -334,7 +334,7 @@ public void TestRichText() string bullet = "\u2610\u2611\u2612"; // Output: ☐☑☒; string text = $@"

-PyMuPDF འདི་ ཡིག་ཆ་བཀྲམ་སྤེལ་གྱི་དོན་ལུ་ པའི་ཐོན་ཐུམ་སྒྲིལ་དྲག་ཤོས་དང་མགྱོགས་ཤོས་ཅིག་ཨིན། +MuPDF.NET འདི་ ཡིག་ཆ་བཀྲམ་སྤེལ་གྱི་དོན་ལུ་ པའི་ཐོན་ཐུམ་སྒྲིལ་དྲག་ཤོས་དང་མགྱོགས་ཤོས་ཅིག་ཨིན། Here is some bold and italic text, followed by bold-italic. Text-based check boxes: {bullet}.

"; diff --git a/MuPDF.NET.Test/GeneralTest.cs b/MuPDF.NET.Test/GeneralTest.cs index 2c63deb..82dfe02 100644 --- a/MuPDF.NET.Test/GeneralTest.cs +++ b/MuPDF.NET.Test/GeneralTest.cs @@ -486,14 +486,6 @@ assert repr(ee) == expected, f'Expected {expected=} but got {repr(ee)=}.' { Console.WriteLine($"test_2548(): {Utils.MUPDF_WARNINGS_STORE[i]}"); } - - // This checks that PyMuPDF 1.23.7 fixes this bug, and also that earlier - // versions with updated MuPDF also fix the bug. - //rebased = hasattr(pymupdf, 'mupdf') - //expected = 'format error: cycle in structure tree\nstructure tree broken, assume tree is missing' - //if rebased: - // assert wt == expected, f'expected:\n {expected!r}\nwt:\n {wt!r}\n' - //assert not e } [Test] diff --git a/MuPDF.NET.Test/resources/test_1645_expected.pdf b/MuPDF.NET.Test/resources/test_1645_expected.pdf index 1196788..55f59f4 100644 Binary files a/MuPDF.NET.Test/resources/test_1645_expected.pdf and b/MuPDF.NET.Test/resources/test_1645_expected.pdf differ diff --git a/MuPDF.NET.sln b/MuPDF.NET.sln index 3a66542..755bc8f 100644 --- a/MuPDF.NET.sln +++ b/MuPDF.NET.sln @@ -1,4 +1,4 @@ - + Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 17 VisualStudioVersion = 17.14.36511.14 @@ -9,6 +9,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MuPDF.NET.Test", "MuPDF.NET EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Demo", "Demo\Demo.csproj", "{D1CCB24F-A868-F185-9228-8CC249247C79}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MuPDF.NET4LLM", "MuPDF.NET4LLM\MuPDF.NET4LLM.csproj", "{9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MuPDF.NET4LLM.Test", "MuPDF.NET4LLM.Test\MuPDF.NET4LLM.Test.csproj", "{5498436C-E1C0-418D-9DA3-0460A3C15953}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -55,6 +59,30 @@ Global {D1CCB24F-A868-F185-9228-8CC249247C79}.Release|x64.Build.0 = Release|x64 {D1CCB24F-A868-F185-9228-8CC249247C79}.Release|x86.ActiveCfg = Release|x86 {D1CCB24F-A868-F185-9228-8CC249247C79}.Release|x86.Build.0 = Release|x86 + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|Any CPU.Build.0 = Debug|Any CPU + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|x64.ActiveCfg = Debug|x64 + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|x64.Build.0 = Debug|x64 + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|x86.ActiveCfg = Debug|x86 + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|x86.Build.0 = Debug|x86 + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|Any CPU.ActiveCfg = Release|Any CPU + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|Any CPU.Build.0 = Release|Any CPU + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|x64.ActiveCfg = Release|x64 + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|x64.Build.0 = Release|x64 + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|x86.ActiveCfg = Release|x86 + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|x86.Build.0 = Release|x86 + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|Any CPU.Build.0 = Debug|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|x64.ActiveCfg = Debug|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|x64.Build.0 = Debug|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|x86.ActiveCfg = Debug|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|x86.Build.0 = Debug|Any 
CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|Any CPU.ActiveCfg = Release|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|Any CPU.Build.0 = Release|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|x64.ActiveCfg = Release|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|x64.Build.0 = Release|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|x86.ActiveCfg = Release|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/MuPDF.NET/Document.cs b/MuPDF.NET/Document.cs index 0622056..864e4d0 100644 --- a/MuPDF.NET/Document.cs +++ b/MuPDF.NET/Document.cs @@ -1685,7 +1685,7 @@ private void _DeletePage(int pno) /// Create a table of contents. /// /// a bool to control output. - /// Returns a list, where each entry consists of outline level, title, page number and link destination (if simple = False). For details see PyMuPDF's documentation. + /// Returns a list, where each entry consists of outline level, title, page number and link destination (if simple = False). For details see MuPDF's documentation. /// public List GetToc(bool simple = true) { @@ -5950,6 +5950,23 @@ public void Bake(bool annots = true, bool widgets = true) pdf.Dispose(); } + public void Dispose() + { + if (IsClosed) + throw new Exception("document closed"); + + if (Outline != null) + { + Outline.Dispose(); + Outline = null; + } + ResetPageRefs(); + IsClosed = true; + GraftMaps = new Dictionary(); + _nativeDocument.Dispose(); + _nativeDocument = null; + } + public void Close() { if (IsClosed) @@ -6000,7 +6017,7 @@ public int AddOcg( PdfObj useFor = ocg.pdf_dict_put_dict(new PdfObj("Usage"), 3); PdfObj ciName = mupdf.mupdf.pdf_new_name("CreatorInfo"); PdfObj creInfo = useFor.pdf_dict_put_dict(ciName, 2); - creInfo.pdf_dict_put_text_string(new PdfObj("Creator"), "PyMuPDF"); + creInfo.pdf_dict_put_text_string(new PdfObj("Creator"), "MuPDF"); if (!string.IsNullOrEmpty(usage)) creInfo.pdf_dict_put_name(new PdfObj("Subtype"), usage); diff --git a/MuPDF.NET/Page.cs b/MuPDF.NET/Page.cs index 1f192d9..83bf2cd 100644 --- a/MuPDF.NET/Page.cs +++ b/MuPDF.NET/Page.cs @@ -1,4 +1,4 @@ -using mupdf; +using mupdf; using SkiaSharp; using System; using System.Collections; @@ -10,7 +10,6 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Text; -using static MuPDF.NET.Global; using static System.Net.Mime.MediaTypeNames; namespace MuPDF.NET diff --git a/MuPDF.NET/Table.cs b/MuPDF.NET/Table.cs index 5fbcbd3..3d432f2 100644 --- a/MuPDF.NET/Table.cs +++ b/MuPDF.NET/Table.cs @@ -1,2007 +1,2296 @@ -using System; +/* +Copyright (C) 2023 Artifex Software, Inc. + +This file is part of MuPDF.NET. + +MuPDF.NET is free software: you can redistribute it and/or modify it under the +terms of the GNU Affero General Public License as published by the Free +Software Foundation, either version 3 of the License, or (at your option) +any later version. + +MuPDF.NET is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +details. + +You should have received a copy of the GNU Affero General Public License +along with MuPDF. If not, see + +Alternative licensing terms are available from the licensor. 
+For commercial licensing, see or contact +Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, +CA 94129, USA, for further information. + +--------------------------------------------------------------------- +Portions of this code have been ported from pdfplumber, see +https://pypi.org/project/pdfplumber/. + +The ported code is under the following MIT license: + +--------------------------------------------------------------------- +The MIT License (MIT) + +Copyright (c) 2015, Jeremy Singer-Vine + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +--------------------------------------------------------------------- +Also see here: https://github.com/jsvine/pdfplumber/blob/stable/LICENSE.txt +--------------------------------------------------------------------- + +The porting mainly pertains to files "table.py" and relevant parts of +"utils/text.py" within pdfplumber's repository on Github. +With respect to "text.py", we have removed functions or features that are not +used by table processing. Examples are: + +* the text search function +* simple text extraction +* text extraction by lines + +Original pdfplumber code does neither detect, nor identify table headers. +This MuPDF.NET port adds respective code to the 'Table' class as method '_get_header'. +This is implemented as new class TableHeader with the properties: +* bbox: A tuple for the header's bbox +* cells: A tuple for each bbox of a column header +* names: A list of strings with column header text +* external: A bool indicating whether the header is outside the table cells. 
+ +*/ + +using mupdf; +using System; using System.Collections.Generic; -using System.Data; +using System.Drawing; using System.Linq; -using System.Net; -using System.Reflection; using System.Text; -using static MuPDF.NET.Global; +using System.Text.RegularExpressions; namespace MuPDF.NET { - public class Global + // Global state for table processing + internal static class TableGlobals { - public class Edge - { - public float x0; - public float y0; - public float x1; - public float y1; - public float width; - public float height; - public Point[] pts; - public float linewidth; - public bool stroke; - public bool fill; - public bool evenodd; - public float[] stroking_color; - public float[] non_stroking_color; - public string object_type; - public int page_number; - public object stroking_pattern; - public object non_stroking_pattern; - public float top; - public float bottom; - public float doctop; - public string orientation; - } - - public class Character - { - public float adv; - public float bottom; - public float doctop; - public string fontname; - public float height; - public Matrix matrix; - public string ncs; - public int non_stroking_color; - public object non_stroking_pattern; - public string object_type; - public int page_number; - public float size; - public int stroking_color; - public object stroking_pattern; - public string text; - public float top; - public bool upright; - public int direction; - public int rotation; - public float width; - public float x0; - public float x1; - public float y0; - public float y1; - } - - // Function to check if the extracted text contains only whitespace characters - public static bool whiteSpaces_issuperset(string text) - { - HashSet whiteSpaces = new HashSet(new[] { - ' ', '\t', '\n', '\r', '\v', '\f' - }); - // Check if all characters in the extracted text are whitespace characters - return text.All(c => whiteSpaces.Contains(c)); - } + internal static List EDGES = new List(); // vector graphics from MuPDF + internal static List CHARS = new List(); // text characters from MuPDF + internal static TextPage TEXTPAGE = null; // textpage for cell text extraction + + // Constants matching Python implementation from __init__.py + internal static readonly HashSet WHITE_SPACES = new HashSet { ' ', '\t', '\n', '\r', '\f', '\v' }; + // From __init__.py: TEXT_FONT_BOLD = 16, but for char flags use FZ_STEXT_BOLD + internal static readonly int TEXT_BOLD = (int)mupdf.mupdf.FZ_STEXT_BOLD; + // From __init__.py: TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT + internal static readonly int TEXT_STRIKEOUT = (int)mupdf.mupdf.FZ_STEXT_STRIKEOUT; + // From __init__.py: TEXT_COLLECT_STYLES = mupdf.FZ_STEXT_COLLECT_STYLES + internal static readonly int TEXT_COLLECT_STYLES = (int)mupdf.mupdf.FZ_STEXT_COLLECT_STYLES; + // From __init__.py: TEXT_COLLECT_VECTORS = mupdf.FZ_STEXT_COLLECT_VECTORS + internal static readonly int TEXT_COLLECT_VECTORS = (int)mupdf.mupdf.FZ_STEXT_COLLECT_VECTORS; + // From __init__.py: TEXT_SEGMENT = mupdf.FZ_STEXT_SEGMENT + internal static readonly int TEXT_SEGMENT = (int)mupdf.mupdf.FZ_STEXT_SEGMENT; + // From table.py FLAGS: TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES | TEXT_ACCURATE_BBOXES | TEXT_MEDIABOX_CLIP + internal static readonly int FLAGS = + (int)TextFlagsExtension.TEXTFLAGS_TEXT | + TEXT_COLLECT_STYLES | + (int)TextFlags.TEXT_ACCURATE_BBOXES | + (int)TextFlags.TEXT_MEDIABOX_CLIP; + // From table.py TABLE_DETECTOR_FLAGS: TEXT_ACCURATE_BBOXES | TEXT_SEGMENT | TEXT_COLLECT_VECTORS | TEXT_MEDIABOX_CLIP + internal static readonly int 
TABLE_DETECTOR_FLAGS = + (int)TextFlags.TEXT_ACCURATE_BBOXES | + TEXT_SEGMENT | + TEXT_COLLECT_VECTORS | + (int)TextFlags.TEXT_MEDIABOX_CLIP; + } - public class BBox + // Constants + internal static class TableConstants + { + internal static readonly string[] NON_NEGATIVE_SETTINGS = { + "snap_tolerance", "snap_x_tolerance", "snap_y_tolerance", + "join_tolerance", "join_x_tolerance", "join_y_tolerance", + "edge_min_length", "min_words_vertical", "min_words_horizontal", + "intersection_tolerance", "intersection_x_tolerance", "intersection_y_tolerance" + }; + + internal static readonly Dictionary LIGATURES = new Dictionary { - public float x0 { get; set; } - public float top { get; set; } - public float x1 { get; set; } - public float bottom { get; set; } + { "ff", "ff" }, + { "ffi", "ffi" }, + { "ffl", "ffl" }, + { "fi", "fi" }, + { "fl", "fl" }, + { "st", "st" }, + { "ſt", "st" } + }; + } - public BBox(float x0, float top, float x1, float bottom) - { - this.x0 = x0; - this.top = top; - this.x1 = x1; - this.bottom = bottom; - } + // Character dictionary structure matching Python implementation + internal class CharDict + { + public float adv { get; set; } + public float bottom { get; set; } + public float doctop { get; set; } + public string fontname { get; set; } + public float height { get; set; } + public Tuple matrix { get; set; } + public string ncs { get; set; } + public Tuple non_stroking_color { get; set; } + public object non_stroking_pattern { get; set; } + public string object_type { get; set; } + public int page_number { get; set; } + public float size { get; set; } + public Tuple stroking_color { get; set; } + public object stroking_pattern { get; set; } + public bool bold { get; set; } + public string text { get; set; } + public float top { get; set; } + public bool upright { get; set; } + public float width { get; set; } + public float x0 { get; set; } + public float x1 { get; set; } + public float y0 { get; set; } + public float y1 { get; set; } + } - // Union method: Combine two rectangles into one that covers both. - public BBox Union(BBox other) - { - float newX0 = Math.Min(this.x0, other.x0); - float newTop = Math.Min(this.top, other.top); - float newX1 = Math.Max(this.x1, other.x1); - float newBottom = Math.Max(this.bottom, other.bottom); + // Edge structure for table detection + public class Edge + { + public float x0 { get; set; } + public float x1 { get; set; } + public float top { get; set; } + public float bottom { get; set; } + public float width { get; set; } + public float height { get; set; } + public string orientation { get; set; } // "h" or "v" + public string object_type { get; set; } + public float doctop { get; set; } + public int page_number { get; set; } + public float y0 { get; set; } + public float y1 { get; set; } + } - return new BBox(newX0, newTop, newX1, newBottom); - } + // Helper functions + internal static class TableHelpers + { + // rect_in_rect - Check whether rectangle 'inner' is fully inside rectangle 'outer' + internal static bool RectInRect(Rect inner, Rect outer) + { + return inner.X0 >= outer.X0 && inner.Y0 >= outer.Y0 && + inner.X1 <= outer.X1 && inner.Y1 <= outer.Y1; + } - // Overload the |= operator to union two rectangles. 
- public static BBox operator |(BBox r1, BBox r2) - { - return r1.Union(r2); - } + // chars_in_rect - Check whether any of the chars are inside rectangle + internal static bool CharsInRect(List chars, Rect rect) + { + return chars.Any(c => + rect.X0 <= c.x0 && c.x1 <= rect.X1 && + rect.Y0 <= c.y0 && c.y1 <= rect.Y1); + } - public bool IsEmpty() - { - if (x0 == 0 && top == 0 && x1 == 0 && bottom == 0) - return true; - return false; - } + // _iou - Compute intersection over union of two rectangles + internal static float Iou(Rect r1, Rect r2) + { + float ix = Math.Max(0, Math.Min(r1.X1, r2.X1) - Math.Max(r1.X0, r2.X0)); + float iy = Math.Max(0, Math.Min(r1.Y1, r2.Y1) - Math.Max(r1.Y0, r2.Y0)); + float intersection = ix * iy; + if (intersection == 0) + return 0; + float area1 = (r1.X1 - r1.X0) * (r1.Y1 - r1.Y0); + float area2 = (r2.X1 - r2.X0) * (r2.Y1 - r2.Y0); + return intersection / (area1 + area2 - intersection); + } - // Override Equals and GetHashCode for Distinct to work correctly - public override bool Equals(object obj) - { - return obj is BBox bbox && - x0 == bbox.x0 && - top == bbox.top && - x1 == bbox.x1 && - bottom == bbox.bottom; - } + // intersects_words_h - Check whether any words are cut through by horizontal line y + internal static bool IntersectsWordsH(Rect bbox, float y, List wordRects) + { + return wordRects.Any(r => RectInRect(r, bbox) && r.Y0 < y && y < r.Y1); + } - public static int CombineHashCodes(float x0, float top, float x1, float bottom) - { - // Start with a prime number to mix in the values. - int hash = 17; + // get_table_dict_from_rect - Extract MuPDF table structure information + // Note: This requires native MuPDF interop to call fz_find_table_within_bounds + // The Python version calls: pymupdf.extra.make_table_dict(textpage.this.m_internal, table_dict, rect) + // This would need to be implemented via P/Invoke or native wrapper + internal static Dictionary GetTableDictFromRect(TextPage textpage, Rect rect) + { + var tableDict = new Dictionary(); + // TODO: Implement native interop call to MuPDF's table detection function + // This is used by make_table_from_bbox which is called when layout_information finds tables + return tableDict; + } - // Combine each hash code using XOR and a prime number multiplier. - hash = hash * 31 + x0.GetHashCode(); - hash = hash * 31 + top.GetHashCode(); - hash = hash * 31 + x1.GetHashCode(); - hash = hash * 31 + bottom.GetHashCode(); + // make_table_from_bbox - Detect table structure within a given rectangle + internal static List MakeTableFromBbox(TextPage textpage, List wordRects, Rect rect) + { + var cells = new List(); + var block = GetTableDictFromRect(textpage, rect); + + if (!block.ContainsKey("type") || Convert.ToInt32(block["type"]) != mupdf.mupdf.FZ_STEXT_BLOCK_GRID) + return cells; + + var bboxList = block["bbox"] as List; + if (bboxList == null || bboxList.Count < 4) + return cells; + + var bbox = new Rect( + Convert.ToSingle(bboxList[0]), + Convert.ToSingle(bboxList[1]), + Convert.ToSingle(bboxList[2]), + Convert.ToSingle(bboxList[3]) + ); - return hash; + var xpos = (block["xpos"] as List)?.Cast>() + .Select(x => Tuple.Create(Convert.ToSingle(x[0]), Convert.ToSingle(x[1]))) + .OrderBy(x => x.Item1).ToList() ?? new List>(); + + var ypos = (block["ypos"] as List)?.Cast>() + .Select(y => Tuple.Create(Convert.ToSingle(y[0]), Convert.ToSingle(y[1]))) + .OrderBy(y => y.Item1).ToList() ?? new List>(); + + var maxUncertain = block["max_uncertain"] as List; + float xmaxu = maxUncertain != null && maxUncertain.Count > 0 ? 
Convert.ToSingle(maxUncertain[0]) : 0; + float ymaxu = maxUncertain != null && maxUncertain.Count > 1 ? Convert.ToSingle(maxUncertain[1]) : 0; + + // Modify ypos to remove uncertain positions + var nypos = new List(); + foreach (var (y, yunc) in ypos) + { + if (yunc > 0) continue; + if (IntersectsWordsH(bbox, y, wordRects)) continue; + if (nypos.Count > 0 && (y - nypos[nypos.Count - 1] < 3)) + nypos[nypos.Count - 1] = y; + else + nypos.Add(y); } - public override int GetHashCode() - { - //return HashCode.Combine(x0, top, x1, bottom); - return CombineHashCodes(x0, top, x1, bottom); - } + ymaxu = Math.Max(0, (float)Math.Round((nypos.Count - 2) * 0.35)); - public static BBox RectToBBox(Rect rect) - { - return new BBox(rect.X0, rect.Y0, rect.X1, rect.Y1); - } + var nxpos = xpos.Where(x => x.Item2 <= ymaxu).Select(x => x.Item1).ToList(); + if (bbox.X1 > nxpos[nxpos.Count - 1] + 3) + nxpos.Add(bbox.X1); - public static Rect BBoxToRect(BBox bbox) + // Compose cells from remaining x and y positions + for (int i = 0; i < nypos.Count - 1; i++) { - return new Rect(bbox.x0, bbox.top, bbox.x1, bbox.bottom); + var rowBox = new Rect(bbox.X0, nypos[i], bbox.X1, nypos[i + 1]); + var rowWords = wordRects.Where(r => RectInRect(r, rowBox)) + .OrderBy(r => r.X0).ToList(); + + var thisXpos = nxpos.Where(x => !rowWords.Any(r => r.X0 < x && x < r.X1)).ToList(); + + for (int j = 0; j < thisXpos.Count - 1; j++) + { + var cell = new Rect(thisXpos[j], nypos[i], thisXpos[j + 1], nypos[i + 1]); + if (!cell.IsEmpty) + cells.Add(cell); + } } + + return cells; } - public static Edge line_to_edge(Edge line) + // extract_cells - Extract text from a cell as plain or MD styled text + internal static string ExtractCells(TextPage textpage, Rect cell, bool markdown = false) { - // Create a new dictionary to hold the edge data - var edge = line; - - // Determine the orientation - string orientation = (Convert.ToSingle(line.top) == Convert.ToSingle(line.bottom)) ? 
"h" : "v"; + if (textpage == null) + return ""; - // Add or update the "orientation" key in the dictionary - edge.orientation = orientation; + var text = new StringBuilder(); + var pageInfo = textpage.ExtractRAWDict(cropbox: null, sort: false); - return edge; - } + if (pageInfo?.Blocks == null) + return ""; - public static List rect_to_edges(Edge rect) - { - Edge top = new Edge + foreach (var block in pageInfo.Blocks) { - object_type = "rect_edge", - height = 0, - y0 = rect.y1, - bottom = rect.top, - orientation = "h" - }; + if (block.Type != 0) continue; - Edge bottom = new Edge - { - object_type = "rect_edge", - height = 0, - y1 = rect.y0, - top = rect.top + rect.height, - doctop = rect.doctop + rect.height, - orientation = "h" - }; + var blockBbox = block.Bbox; + if (blockBbox == null) continue; - Edge left = new Edge - { - object_type = "rect_edge", - width = 0, - x1 = rect.x0, - orientation = "v" - }; + if (blockBbox.X0 > cell.X1 || blockBbox.X1 < cell.X0 || + blockBbox.Y0 > cell.Y1 || blockBbox.Y1 < cell.Y0) + continue; - Edge right = new Edge - { - object_type = "rect_edge", - width = 0, - x0 = rect.x1, - orientation = "v" - }; + if (block.Lines == null) continue; - return new List { top, bottom, left, right }; - } + foreach (var line in block.Lines) + { + if (line.Bbox == null) continue; - public static List curve_to_edges(Edge curve) - { - // Extract points and other properties from the curve - Point[] points = curve.pts; + var lbbox = line.Bbox; + if (lbbox.X0 > cell.X1 || lbbox.X1 < cell.X0 || + lbbox.Y0 > cell.Y1 || lbbox.Y1 < cell.Y0) + continue; - var edges = new List(); + if (text.Length > 0) + text.Append(markdown ? "
" : "\n"); - for (int i = 0; i < points.Length - 1; i++) - { - Point p0 = points[i]; - Point p1 = points[i + 1]; + var lineDir = line.Dir; + bool horizontal = lineDir != null && + (lineDir.X == 0 && lineDir.Y == 1 || lineDir.X == 1 && lineDir.Y == 0); - var edge = new Edge - { - object_type = "curve_edge", - x0 = Math.Min(p0.X, p1.X), - x1 = Math.Max(p0.X, p1.X), - top = Math.Min(p0.Y, p1.Y), - doctop = Math.Min(p0.Y, p1.Y) + (curve.doctop - curve.top), - bottom = Math.Max(p0.Y, p1.Y), - width = Math.Abs(p0.X - p1.X), - height = Math.Abs(p0.Y - p1.Y), - orientation = (p0.X == p1.X) ? "v" : (p0.Y == p1.Y) ? "h" : null - }; + if (line.Spans == null) continue; + + foreach (var span in line.Spans) + { + if (span.Bbox == null) continue; + + var sbbox = span.Bbox; + if (sbbox.X0 > cell.X1 || sbbox.X1 < cell.X0 || + sbbox.Y0 > cell.Y1 || sbbox.Y1 < cell.Y0) + continue; + + var spanText = new StringBuilder(); + if (span.Chars != null) + { + foreach (var char_ in span.Chars) + { + if (char_.Bbox == null) continue; + + var charRect = new Rect(char_.Bbox); + var cellRect = new Rect(cell.X0, cell.Y0, cell.X1, cell.Y1); + var intersection = charRect & cellRect; + + if (intersection != null && !intersection.IsEmpty && + (intersection.Width * intersection.Height) > 0.5 * (charRect.Width * charRect.Height)) + { + spanText.Append(char_.C); + } + else if (TableGlobals.WHITE_SPACES.Contains(char_.C)) + { + spanText.Append(" "); + } + } + } + else if (!string.IsNullOrEmpty(span.Text)) + { + spanText.Append(span.Text); + } - edges.Add(edge); + if (spanText.Length == 0) continue; + + if (!markdown) + { + text.Append(spanText); + continue; + } + + string prefix = ""; + string suffix = ""; + float flags = span.Flags; + + if (horizontal && ((int)flags & TableGlobals.TEXT_STRIKEOUT) != 0) + { + prefix += "~~"; + suffix = "~~" + suffix; + } + if (((int)flags & TableGlobals.TEXT_BOLD) != 0) + { + prefix += "**"; + suffix = "**" + suffix; + } + if (((int)flags & (int)FontStyle.TEXT_FONT_ITALIC) != 0) + { + prefix += "_"; + suffix = "_" + suffix; + } + if (((int)flags & (int)FontStyle.TEXT_FONT_MONOSPACED) != 0) + { + prefix += "`"; + suffix = "`" + suffix; + } + + string spanTextStr = spanText.ToString(); + if (span.Chars != null && span.Chars.Count > 2) + spanTextStr = spanTextStr.TrimEnd(); + + if (suffix.Length > 0 && text.ToString().EndsWith(suffix)) + { + text.Remove(text.Length - suffix.Length, suffix.Length); + text.Append(spanTextStr + suffix); + } + else + { + if (string.IsNullOrWhiteSpace(spanTextStr)) + text.Append(" "); + else + text.Append(prefix + spanTextStr + suffix); + } + } + } } - return edges; + return text.ToString().Trim(); } - public static List obj_to_edges(Edge obj) + // to_list - Convert collection to list + internal static List ToList(object collection) { - string type = obj.object_type; - - if (type.Contains("_edge")) - { - // If it's an edge object, return it as-is. 
- return new List { obj }; - } - else if (type == "line") - { - // If it's a line, process it using line_to_edge (you'll need to define line_to_edge method) - return new List { line_to_edge(obj) }; - } - else if (type == "rect") - { - return rect_to_edges(obj); - } - else if (type == "curve") - { - return curve_to_edges(obj); - } - return null; + if (collection is List list) + return list; + if (collection is IEnumerable enumerable) + return enumerable.ToList(); + return new List { (T)collection }; } - // Filter edges based on orientation, type, and minimum length - public static List filter_edges( - List edges, - string orientation = null, - string edgeType = null, - float minLength = 1 - ) + // Helper function for clustering objects + internal static List> ClusterObjects(IEnumerable xs, Func keyFn, float tolerance) { - // Validate orientation - if (orientation != null && orientation != "v" && orientation != "h") - { - throw new ArgumentException("Orientation must be 'v' or 'h'"); - } + if (tolerance == 0) + return xs.OrderBy(keyFn).Select(x => new List { x }).ToList(); - // Function to test if an edge meets the criteria - bool test(Edge e) - { - // Determine the dimension (width or height) based on orientation - float dimension = (e.orientation == "v") ? e.height : e.width; + var xsList = xs.ToList(); + if (xsList.Count < 2) + return xsList.Select(x => new List { x }).ToList(); - bool etCorrect = edgeType == null || e.object_type == edgeType; - bool orientCorrect = orientation == null || e.orientation == orientation; + var values = xsList.Select(keyFn).Distinct().OrderBy(v => v).ToList(); + var clusters = ClusterList(values, tolerance); - return etCorrect && orientCorrect && dimension >= minLength; + var clusterDict = new Dictionary(); + for (int i = 0; i < clusters.Count; i++) + { + foreach (var val in clusters[i]) + clusterDict[val] = i; } - // Use LINQ to filter edges - return edges.Where(test).ToList(); + var grouped = xsList.GroupBy(x => clusterDict[keyFn(x)]).OrderBy(g => g.Key); + return grouped.Select(g => g.ToList()).ToList(); } - public static List> cluster_list(List xs, float tolerance = 0f) + internal static List> ClusterList(List xs, float tolerance = 0) { if (tolerance == 0) - { return xs.OrderBy(x => x).Select(x => new List { x }).ToList(); - } if (xs.Count < 2) - { - return xs.OrderBy(x => x).Select(x => new List { x }).ToList(); - } + return xs.Select(x => new List { x }).ToList(); var groups = new List>(); - xs.Sort(); - var currentGroup = new List { xs[0] }; - float last = xs[0]; + var sortedXs = xs.OrderBy(x => x).ToList(); + var currentGroup = new List { sortedXs[0] }; + float last = sortedXs[0]; - foreach (var x in xs.Skip(1)) + for (int i = 1; i < sortedXs.Count; i++) { - if (x <= last + tolerance) + float x = sortedXs[i]; + if (x <= (last + tolerance)) { currentGroup.Add(x); } else { - groups.Add(new List(currentGroup)); + groups.Add(currentGroup); currentGroup = new List { x }; } last = x; } - groups.Add(currentGroup); return groups; } - public static Dictionary make_cluster_dict(List values, float tolerance) + internal static Rect ObjectsToBbox(IEnumerable objects) { - var clusters = cluster_list(values.Distinct().ToList(), tolerance); - var clusterDict = new Dictionary(); - - var index = 0; - foreach (var cluster in clusters) + var rects = new List(); + foreach (var obj in objects) { - foreach (var value in cluster) + if (obj is CharDict charDict) + { + rects.Add(new Rect(charDict.x0, charDict.top, charDict.x1, charDict.bottom)); + } + else if (obj is 
Dictionary dict) { - clusterDict[value] = index; + if (dict.ContainsKey("x0") && dict.ContainsKey("top") && dict.ContainsKey("x1") && dict.ContainsKey("bottom")) + { + rects.Add(new Rect( + Convert.ToSingle(dict["x0"]), + Convert.ToSingle(dict["top"]), + Convert.ToSingle(dict["x1"]), + Convert.ToSingle(dict["bottom"]) + )); + } } - index++; } - return clusterDict; - } + if (rects.Count == 0) + return new Rect(0, 0, 0, 0); - public static List> cluster_objects(List xs, Func keyFn, float tolerance) - { - var values = xs.Select(keyFn).ToList(); - var clusterDict = make_cluster_dict(values, tolerance); + return new Rect( + rects.Min(r => r.X0), + rects.Min(r => r.Y0), + rects.Max(r => r.X1), + rects.Max(r => r.Y1) + ); + } + } - var clusterTuples = xs.Select(x => new { Object = x, ClusterId = clusterDict[keyFn(x)] }) - .OrderBy(t => t.ClusterId) - .ToList(); + // TextMap class - maps each unicode character to a char object + internal class TextMap + { + public List> tuples { get; set; } + public string as_string { get; set; } - var grouped = clusterTuples.GroupBy(t => t.ClusterId) - .Select(g => g.Select(t => t.Object).ToList()) - .ToList(); - return grouped; + public TextMap(List> tuples = null) + { + this.tuples = tuples ?? new List>(); + this.as_string = string.Join("", this.tuples.Select(t => t.Item1)); } - public static Edge move_object(Edge obj, string axis, float value) + public Dictionary MatchToDict( + Match m, + int mainGroup = 0, + bool returnGroups = true, + bool returnChars = true) { - // Ensure the axis is valid - if (axis != "h" && axis != "v") - { - throw new ArgumentException("Axis must be 'h' or 'v'", nameof(axis)); - } + var subset = tuples.Skip(m.Groups[mainGroup].Index).Take(m.Groups[mainGroup].Length).ToList(); + var chars = subset.Where(t => t.Item2 != null).Select(t => t.Item2).ToList(); + var bbox = TableHelpers.ObjectsToBbox(chars); - // Prepare the new property values - var newProperties = new List<(string, float)>(); + var result = new Dictionary + { + { "text", m.Groups[mainGroup].Value }, + { "x0", bbox.X0 }, + { "top", bbox.Y0 }, + { "x1", bbox.X1 }, + { "bottom", bbox.Y1 } + }; - if (axis == "h") + if (returnGroups) { - newProperties.Add(("x0", obj.x0 + value)); - newProperties.Add(("x1", obj.x1 + value)); + var groups = new List(); + for (int i = 1; i < m.Groups.Count; i++) + groups.Add(m.Groups[i].Value); + result["groups"] = groups; } - if (axis == "v") - { - newProperties.Add(("top", obj.top + value)); - newProperties.Add(("bottom", obj.bottom + value)); + if (returnChars) + result["chars"] = chars; + + return result; + } + } + + // WordMap class - maps words to chars + internal class WordMap + { + public List, List>> tuples { get; set; } + + public WordMap(List, List>> tuples) + { + this.tuples = tuples; + } + + public TextMap ToTextmap( + bool layout = false, + float layoutWidth = 0, + float layoutHeight = 0, + int layoutWidthChars = 0, + int layoutHeightChars = 0, + float xDensity = TableFlags.TABLE_DEFAULT_X_DENSITY, + float yDensity = TableFlags.TABLE_DEFAULT_Y_DENSITY, + float xShift = 0, + float yShift = 0, + float yTolerance = TableFlags.TABLE_DEFAULT_Y_TOLERANCE, + bool useTextFlow = false, + bool presorted = false, + bool expandLigatures = true) + { + var textmap = new List>(); - // Handle optional properties if they exist - if (obj.doctop >= 0f) + if (tuples.Count == 0) + return new TextMap(textmap); + + var expansions = expandLigatures ? 
TableConstants.LIGATURES : new Dictionary(); + + int layoutWidthCharsFinal = layoutWidthChars; + int layoutHeightCharsFinal = layoutHeightChars; + + if (layout) + { + if (layoutWidthChars > 0) { - newProperties.Add(("doctop", obj.doctop + value)); + if (layoutWidth > 0) + throw new ArgumentException("`layout_width` and `layout_width_chars` cannot both be set."); } - - if (obj.y0 >= 0f) + else { - newProperties.Add(("y0", obj.y0 - value)); - newProperties.Add(("y1", obj.y1 - value)); + layoutWidthCharsFinal = (int)Math.Round(layoutWidth / xDensity); } - } - // Create a new MyObject with the updated values - var newObj = new Edge(); - newObj = obj; - - // Update the properties - foreach (var prop in newProperties) - { - // You will need to use reflection or manual assignment for the dynamic property names - switch (prop.Item1) + if (layoutHeightChars > 0) + { + if (layoutHeight > 0) + throw new ArgumentException("`layout_height` and `layout_height_chars` cannot both be set."); + } + else { - case "x0": - newObj.x0 = prop.Item2; - break; - case "x1": - newObj.x1 = prop.Item2; - break; - case "top": - newObj.top = prop.Item2; - break; - case "bottom": - newObj.bottom = prop.Item2; - break; - case "doctop": - newObj.doctop = prop.Item2; - break; - case "y0": - newObj.y0 = prop.Item2; - break; - case "y1": - newObj.y1 = prop.Item2; - break; + layoutHeightCharsFinal = (int)Math.Round(layoutHeight / yDensity); } } - return newObj; - } + var blankLine = layout ? Enumerable.Range(0, layoutWidthCharsFinal) + .Select(_ => Tuple.Create(" ", (CharDict)null)).ToList() : new List>(); + int numNewlines = 0; - public static List snap_objects(List objs, string attr, float tolerance) - { - // Mapping the attribute to the axis (horizontal or vertical) - //string axis = attr switch - //{ - // "x0" => "h", - // "x1" => "h", - // "top" => "v", - // "bottom" => "v", - // _ => throw new ArgumentException("Invalid attribute", nameof(attr)) - //}; - string axis; - switch (attr) - { - case "x0": - case "x1": - axis = "h"; - break; - case "top": - case "bottom": - axis = "v"; - break; - default: - throw new ArgumentException("Invalid attribute", nameof(attr)); - } + var wordsSortedDoctop = presorted || useTextFlow + ? 
tuples + : tuples.OrderBy(t => Convert.ToSingle(t.Item1["doctop"])).ToList(); - List> clusters = new List>(); - List avgs = new List(); - List> snappedClusters = new List>(); - switch (attr) - { - case "x0": - clusters = cluster_objects(objs, obj => obj.x0, tolerance); - avgs = clusters.Select(cluster => cluster.Average(obj => obj.x0)).ToList(); - snappedClusters = clusters.Select((cluster, idx) => - cluster.Select(obj => move_object(obj, axis, avgs[idx] - (float)obj.x0)).ToList()).ToList(); - break; - case "x1": - clusters = cluster_objects(objs, obj => obj.x1, tolerance); - avgs = clusters.Select(cluster => cluster.Average(obj => obj.x1)).ToList(); - snappedClusters = clusters.Select((cluster, idx) => - cluster.Select(obj => move_object(obj, axis, avgs[idx] - (float)obj.x1)).ToList()).ToList(); - break; - case "top": - clusters = cluster_objects(objs, obj => obj.top, tolerance); - avgs = clusters.Select(cluster => cluster.Average(obj => obj.top)).ToList(); - snappedClusters = clusters.Select((cluster, idx) => - cluster.Select(obj => move_object(obj, axis, avgs[idx] - (float)obj.top)).ToList()).ToList(); - break; - case "bottom": - clusters = cluster_objects(objs, obj => obj.bottom, tolerance); - avgs = clusters.Select(cluster => cluster.Average(obj => obj.bottom)).ToList(); - snappedClusters = clusters.Select((cluster, idx) => - cluster.Select(obj => move_object(obj, axis, avgs[idx] - (float)obj.bottom)).ToList()).ToList(); - break; - default: - return null; - } + if (wordsSortedDoctop.Count == 0) + return new TextMap(textmap); - // Flatten the list of snapped clusters and return - return snappedClusters.SelectMany(cluster => cluster).ToList(); - } + var firstWord = wordsSortedDoctop[0].Item1; + float doctopStart = Convert.ToSingle(firstWord["doctop"]) - Convert.ToSingle(firstWord["top"]); - // Given a list of edges, snap any within `tolerance` pixels of one another - // to their positional average. - public static List snap_edges( - List edges, - float xTolerance = 1.0f, - float yTolerance = 1.0f) - { - // Group edges by orientation - var byOrientation = new Dictionary>() - { - { "v", new List() }, - { "h", new List() } - }; + var clusters = TableHelpers.ClusterObjects(wordsSortedDoctop, t => Convert.ToSingle(t.Item1["doctop"]), yTolerance); - foreach (var edge in edges) + for (int i = 0; i < clusters.Count; i++) { - byOrientation[edge.orientation].Add(edge); - } + var ws = clusters[i]; + float yDist = layout + ? (Convert.ToSingle(ws[0].Item1["doctop"]) - (doctopStart + yShift)) / yDensity + : 0; - // Snap vertical and horizontal edges separately - List snappedV = snap_objects(byOrientation["v"], "x0", xTolerance); - List snappedH = snap_objects(byOrientation["h"], "top", yTolerance); + int numNewlinesPrepend = Math.Max( + i > 0 ? 1 : 0, + (int)Math.Round(yDist) - numNewlines + ); - // Combine and return snapped objects - return snappedV.Concat(snappedH).ToList(); - } + for (int j = 0; j < numNewlinesPrepend; j++) + { + if (textmap.Count == 0 || textmap[textmap.Count - 1].Item1 == "\n") + textmap.AddRange(blankLine); + textmap.Add(Tuple.Create("\n", (CharDict)null)); + } - // Resize the object based on the given key and value - public static Edge resize_object(Edge obj, string key, float value) - { - if (!new[] { "x0", "x1", "top", "bottom" }.Contains(key)) - { - throw new ArgumentException("Invalid key. 
Must be one of 'x0', 'x1', 'top', 'bottom'.", nameof(key)); - } + numNewlines += numNewlinesPrepend; - Edge newObj = new Edge(); - newObj = obj; + int lineLen = 0; + var lineWordsSortedX0 = presorted || useTextFlow + ? ws + : ws.OrderBy(t => Convert.ToSingle(t.Item1["x0"])).ToList(); - if (key == "x0") - { - if (value > obj.x1) throw new ArgumentException("x0 must be less than or equal to x1."); - newObj.x0 = value; - newObj.width = obj.x1 - value; - } - else if (key == "x1") - { - if (value < obj.x0) throw new ArgumentException("x1 must be greater than or equal to x0."); - newObj.x1 = value; - newObj.width = value - obj.x0; - } - else if (key == "top") - { - if (value > obj.bottom) throw new ArgumentException("top must be less than or equal to bottom."); - float oldValue = obj.top; - float diff = value - oldValue; - newObj.top = value; - newObj.doctop = obj.doctop + diff; - newObj.height = obj.height - diff; - if (obj.y1 >= 0f) - newObj.y1 = obj.y1 - diff; - } - else if (key == "bottom") - { - if (value < obj.top) throw new ArgumentException("bottom must be greater than or equal to top."); - float oldValue = obj.bottom; - float diff = value - oldValue; - newObj.bottom = value; - newObj.height = obj.height + diff; - if (obj.y0 >= 0f) - newObj.y0 = obj.y0 - diff; - } - - // Return a new object with the updated properties - return newObj; - } - - // Given a list of edges along the same infinite line, join those that - // are within `tolerance` pixels of one another. - public static List join_edge_group(List edges, string orientation, float tolerance = TableFlags.TABLE_DEFAULT_JOIN_TOLERANCE) - { - List joined = new List(); - if (orientation == "h") - { - // Sort edges by the min property - var sortedEdges = edges.OrderBy(e => e.x0).ToList(); - joined = new List { sortedEdges[0] }; - - foreach (var e in sortedEdges.Skip(1)) + foreach (Tuple, List> tuple in lineWordsSortedX0) { - var last = joined.Last(); - if (e.x0 <= last.x1 + tolerance) + var word = tuple.Item1; + var chars = tuple.Item2; + float xDist = layout ? (Convert.ToSingle(word["x0"]) - xShift) / xDensity : 0; + int numSpacesPrepend = Math.Max(Math.Min(1, lineLen), (int)Math.Round(xDist) - lineLen); + + for (int k = 0; k < numSpacesPrepend; k++) + textmap.Add(Tuple.Create(" ", (CharDict)null)); + lineLen += numSpacesPrepend; + + foreach (var c in chars) { - if (e.x1 > last.x1) + string letters = expansions.ContainsKey(c.text) ? 
expansions[c.text] : c.text; + foreach (char letter in letters) { - // Extend current edge to new extremity - joined[joined.Count - 1] = resize_object(last, "x1", e.x1); + textmap.Add(Tuple.Create(letter.ToString(), c)); + lineLen++; } } - else - { - // Edge is separate from the previous edge - joined.Add(e); - } } - } - else if (orientation == "v") - { - // Sort edges by the min property - var sortedEdges = edges.OrderBy(e => e.top).ToList(); - joined = new List { sortedEdges[0] }; - foreach (var e in sortedEdges.Skip(1)) + if (layout) { - var last = joined.Last(); - if (e.top <= last.bottom + tolerance) - { - if (e.bottom > last.bottom) - { - // Extend current edge to new extremity - joined[joined.Count - 1] = resize_object(last, "bottom", e.bottom); - } - } - else - { - // Edge is separate from the previous edge - joined.Add(e); - } + for (int k = 0; k < layoutWidthCharsFinal - lineLen; k++) + textmap.Add(Tuple.Create(" ", (CharDict)null)); } } - else + + if (layout) { - throw new ArgumentException("Orientation must be 'v' or 'h'", nameof(orientation)); + int numNewlinesAppend = layoutHeightCharsFinal - (numNewlines + 1); + for (int i = 0; i < numNewlinesAppend; i++) + { + if (i > 0) + textmap.AddRange(blankLine); + textmap.Add(Tuple.Create("\n", (CharDict)null)); + } + + if (textmap.Count > 0 && textmap[textmap.Count - 1].Item1 == "\n") + textmap.RemoveAt(textmap.Count - 1); } - return joined; + return new TextMap(textmap); } + } - // Using the `snap_edges` and `join_edge_group` methods above, - // merge a list of edges into a more "seamless" list. - public static List merge_edges( - List edges, - float snap_x_tolerance, - float snap_y_tolerance, - float join_x_tolerance, - float join_y_tolerance) - { - (string, float) get_group(Edge edge) - { - if (edge.orientation == "h") - return ("h", edge.top); - else - return ("v", edge.x0); - } + // WordExtractor class + internal class WordExtractor + { + public float x_tolerance { get; set; } + public float y_tolerance { get; set; } + public bool keep_blank_chars { get; set; } + public bool use_text_flow { get; set; } + public bool horizontal_ltr { get; set; } + public bool vertical_ttb { get; set; } + public List extra_attrs { get; set; } + public string split_at_punctuation { get; set; } + public Dictionary expansions { get; set; } - // Snap edges if tolerance values are greater than 0 - if (snap_x_tolerance > 0 || snap_y_tolerance > 0) - { - edges = snap_edges(edges, snap_x_tolerance, snap_y_tolerance); - } -/* - // Group edges by orientation - var edgeGroups = edges - .OrderBy(e => e.orientation == "h" ? e.top : e.x0) - .GroupBy(e => e.orientation == "h" ? "h" : "v"); -*/ - // Sort edges by group (orientation + position) - var _sorted = edges.OrderBy(e => e.orientation) - .ThenBy(e => e.orientation == "h" ? 
e.top : e.x0) - .ToList(); - - // Group edges by the same group key - var edgeGroups = _sorted - .GroupBy(get_group) - .ToList(); + public WordExtractor( + float x_tolerance = TableFlags.TABLE_DEFAULT_X_TOLERANCE, + float y_tolerance = TableFlags.TABLE_DEFAULT_Y_TOLERANCE, + bool keep_blank_chars = false, + bool use_text_flow = false, + bool horizontal_ltr = true, + bool vertical_ttb = false, + List extra_attrs = null, + bool split_at_punctuation = false, + bool expand_ligatures = true) + { + this.x_tolerance = x_tolerance; + this.y_tolerance = y_tolerance; + this.keep_blank_chars = keep_blank_chars; + this.use_text_flow = use_text_flow; + this.horizontal_ltr = horizontal_ltr; + this.vertical_ttb = vertical_ttb; + this.extra_attrs = extra_attrs ?? new List(); + this.split_at_punctuation = split_at_punctuation + ? "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" + : ""; + this.expansions = expand_ligatures ? TableConstants.LIGATURES : new Dictionary(); + } + + public Dictionary MergeChars(List orderedChars) + { + var bbox = TableHelpers.ObjectsToBbox(orderedChars); + float doctopAdj = orderedChars[0].doctop - orderedChars[0].top; + bool upright = orderedChars[0].upright; + int direction = (upright ? horizontal_ltr : vertical_ttb) ? 1 : -1; - // Join edges by their groups - var joinedEdges = new List(); - foreach (var group in edgeGroups) + var matrix = orderedChars[0].matrix; + int rotation = 0; + + if (!upright && matrix.Item2 < 0) { - float tolerance = group.Key.Item1 == "h" ? join_x_tolerance : join_y_tolerance; - joinedEdges.AddRange(join_edge_group(group.ToList(), group.Key.Item1, tolerance)); + orderedChars = orderedChars.AsEnumerable().Reverse().ToList(); + rotation = 270; } - return joinedEdges; - } + if (matrix.Item1 < 0 && matrix.Item4 < 0) + rotation = 180; + else if (matrix.Item2 > 0) + rotation = 90; - // Return the rectangle(i.e a dict with keys "x0", "top", "x1", - // "bottom") for an object. - public static Dictionary bbox_to_rect(BBox bbox) - { - var rect = new Dictionary - { - { "x0", bbox.x0 }, - { "top", bbox.top }, - { "x1", bbox.x1 }, - { "bottom", bbox.bottom } + var word = new Dictionary + { + { "text", string.Join("", orderedChars.Select(c => expansions.ContainsKey(c.text) ? expansions[c.text] : c.text)) }, + { "x0", bbox.X0 }, + { "x1", bbox.X1 }, + { "top", bbox.Y0 }, + { "doctop", bbox.Y0 + doctopAdj }, + { "bottom", bbox.Y1 }, + { "upright", upright }, + { "direction", direction }, + { "rotation", rotation } }; - return rect; - } + foreach (var key in extra_attrs) + { + if (orderedChars.Count > 0) + { + var prop = typeof(CharDict).GetProperty(key); + if (prop != null) + word[key] = prop.GetValue(orderedChars[0]); + } + } - // Given an iterable of objects, return the smallest rectangle(i.e.a - // dict with "x0", "top", "x1", and "bottom" keys) that contains them - // all. - public static Dictionary objects_to_rect(IEnumerable objects) - { - BBox bbox = objects_to_bbox(objects); - return bbox_to_rect(bbox); + return word; } - // Given an iterable of bounding boxes, return the smallest bounding box - // that contains them all. 
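+        // A minimal sketch of the bounding-box merge that merge_bboxes / MergeBboxes perform:
+        // the box enclosing several (x0, top, x1, bottom) boxes is just the per-coordinate
+        // min/max. The value-tuple shape below is illustrative, not the real BBox type.
+        static (float x0, float top, float x1, float bottom) MergeBoxesSketch(
+            IEnumerable<(float x0, float top, float x1, float bottom)> boxes)
+        {
+            var list = boxes.ToList();
+            if (list.Count == 0)
+                return (0f, 0f, 0f, 0f);            // same empty-input fallback as the code above
+            return (
+                list.Min(b => b.x0),                // leftmost edge
+                list.Min(b => b.top),               // topmost edge
+                list.Max(b => b.x1),                // rightmost edge
+                list.Max(b => b.bottom));           // bottommost edge
+        }
+        // MergeBoxesSketch(new[] { (0f, 0f, 10f, 5f), (8f, 2f, 20f, 12f) }) == (0, 0, 20, 12)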
- public static BBox merge_bboxes(List bboxes) + public bool CharBeginsNewWord(CharDict prevChar, CharDict currChar) { - if (bboxes.Count > 0) + if (currChar.upright) { - var x0 = bboxes.Select(b => b.x0).Min(); - var top = bboxes.Select(b => b.top).Min(); - var x1 = bboxes.Select(b => b.x1).Max(); - var bottom = bboxes.Select(b => b.bottom).Max(); + float x = x_tolerance; + float y = y_tolerance; + float ay = prevChar.top; + float cy = currChar.top; + float ax, bx, cx; + + if (horizontal_ltr) + { + ax = prevChar.x0; + bx = prevChar.x1; + cx = currChar.x0; + } + else + { + ax = -prevChar.x1; + bx = -prevChar.x0; + cx = -currChar.x1; + } - return new BBox(x0, top, x1, bottom); + return (cx < ax) || (cx > bx + x) || (cy > ay + y); } else { - return new BBox(0, 0, 0, 0); + float x = y_tolerance; + float y = x_tolerance; + float ay = prevChar.x0; + float cy = currChar.x0; + float ax, bx, cx; + + if (vertical_ttb) + { + ax = prevChar.top; + bx = prevChar.bottom; + cx = currChar.top; + } + else + { + ax = -prevChar.bottom; + bx = -prevChar.top; + cx = -currChar.bottom; + } + + return (cx < ax) || (cx > bx + x) || (cy > ay + y); } } - // Given an iterable of objects, return the smallest bounding box that - // contains them all. - public static BBox objects_to_bbox(IEnumerable objects) + public IEnumerable> IterCharsToWords(IEnumerable orderedChars) { - List bboxes = new List(); - foreach (var obj in objects) + var currentWord = new List(); + + foreach (var char_ in orderedChars) { - if (obj is Character) + string text = char_.text; + + if (!keep_blank_chars && string.IsNullOrWhiteSpace(text)) + { + if (currentWord.Count > 0) + { + yield return currentWord; + currentWord = new List(); + } + } + else if (split_at_punctuation.Contains(text)) + { + currentWord.Add(char_); + yield return currentWord; + currentWord = new List(); + } + else if (currentWord.Count > 0 && CharBeginsNewWord(currentWord[currentWord.Count - 1], char_)) { - Character ch = obj as Character; - bboxes.Add(new BBox(ch.x0, ch.top, ch.x1, ch.bottom)); + yield return currentWord; + currentWord = new List { char_ }; } else { - bboxes.Add(obj as BBox); + currentWord.Add(char_); } } - return merge_bboxes(bboxes); + + if (currentWord.Count > 0) + yield return currentWord; } - // Find(imaginary) horizontal lines that connect the tops - // of at least `word_threshold` words. - public static List words_to_edges_h(List words, int wordThreshold = (int)TableFlags.TABLE_DEFAULT_MIN_WORDS_HORIZONTAL) + public IEnumerable IterSortChars(IEnumerable chars) { - // Cluster the words by 'top' value (simulating `cluster_objects`) - var byTop = cluster_objects(words, obj => obj.top, 1); + var charsList = chars.ToList(); + var uprightClusters = TableHelpers.ClusterObjects(charsList, c => c.upright ? -1 : 0, 0); - // Filter clusters by the word threshold - List> largeClusters = byTop.Where(cluster => cluster.Count >= wordThreshold).ToList(); - - // Convert clusters to bounding rectangles - var rects = largeClusters.Select(c => objects_to_bbox(c)).ToList(); - - if (rects.Count == 0) - return new List(); - - // Find min and max x0 and x1 values - float minX0 = rects.Min(r => r.x0); - float maxX1 = rects.Max(r => r.x1); + foreach (var uprightCluster in uprightClusters) + { + bool upright = uprightCluster[0].upright; + string clusterKey = upright ? 
"doctop" : "x0"; - List edges = new List(); + var subclusters = TableHelpers.ClusterObjects(uprightCluster, c => GetCharValue(c, clusterKey), y_tolerance); - foreach (var r in rects) - { - // Add the 'top' edge for each detected row - edges.Add(new Edge + foreach (var sc in subclusters) { - x0 = minX0, - x1 = maxX1, - top = r.top, - bottom = r.top, - width = maxX1 - minX0, - orientation = "h" - }); + string sortKey = upright ? "x0" : "doctop"; + var toYield = sc.OrderBy(c => GetCharValue(c, sortKey)).ToList(); - // Add the 'bottom' edge for each detected row (catches last row) - edges.Add(new Edge - { - x0 = minX0, - x1 = maxX1, - top = r.bottom, - bottom = r.bottom, - width = maxX1 - minX0, - orientation = "h" - }); - } + if (!(upright ? horizontal_ltr : vertical_ttb)) + toYield.Reverse(); - return edges; + foreach (var c in toYield) + yield return c; + } + } } - public static BBox get_bbox_overlap(BBox a, BBox b) + private float GetCharValue(CharDict c, string key) { - float oLeft = Math.Max(a.x0, b.x0); - float oRight = Math.Min(a.x1, b.x1); - float oBottom = Math.Min(a.bottom, b.bottom); - float oTop = Math.Max(a.top, b.top); - - float oWidth = oRight - oLeft; - float oHeight = oBottom - oTop; - - if (oHeight >= 0 && oWidth >= 0 && oHeight + oWidth > 0) + switch (key) { - return new BBox(oLeft, oTop, oRight, oBottom); + case "x0": + return c.x0; + case "doctop": + return c.doctop; + case "top": + return c.top; + default: + return 0; } - return null; } - // Find(imaginary) vertical lines that connect the left, right, or - // center of at least `word_threshold` words. - public static List words_to_edges_v(List words, int wordThreshold = (int)TableFlags.TABLE_DEFAULT_MIN_WORDS_VERTICAL) + public IEnumerable, List>> IterExtractTuples(IEnumerable chars) { - // Find words that share the same left, right, or centerpoints - var byX0 = cluster_objects(words, w => w.x0, 1); - var byX1 = cluster_objects(words, w => w.x1, 1); - var byCenter = cluster_objects(words, w => (w.x0 + w.x1) / 2, 1); - - var clusters = byX0.Concat(byX1).Concat(byCenter).ToList(); - - // Find the points that align with the most words - var sortedClusters = clusters.OrderByDescending(c => c.Count).ToList(); - var largeClusters = sortedClusters.Where(c => c.Count >= wordThreshold).ToList(); + var orderedChars = use_text_flow ? chars : IterSortChars(chars); + var groupedChars = orderedChars.GroupBy(c => new { c.upright }); - // For each of those points, find the bboxes fitting all matching words - var bboxes = largeClusters.Select(c => objects_to_bbox(c)).ToList(); - - // Iterate through those bboxes, condensing overlapping bboxes - var condensedBboxes = new List(); - foreach (var bbox in bboxes) + foreach (var group in groupedChars) { - bool overlap = condensedBboxes.Any(existingBbox => get_bbox_overlap(bbox, existingBbox) != null); - if (!overlap) + foreach (var wordChars in IterCharsToWords(group)) { - condensedBboxes.Add(bbox); + yield return Tuple.Create(MergeChars(wordChars), wordChars); } } + } - if (condensedBboxes.Count == 0) - { - return new List(); - } - - var condensedRects = condensedBboxes.Select(b => bbox_to_rect(b)).ToList(); + public WordMap ExtractWordmap(IEnumerable chars) + { + return new WordMap(IterExtractTuples(chars).ToList()); + } - // Sort rectangles by x0. 
- var sortedRects = condensedRects.OrderBy(r => r["x0"]).ToList(); + public List> ExtractWords(IEnumerable chars) + { + return IterExtractTuples(chars).Select(t => t.Item1).ToList(); + } + } - float maxX1 = sortedRects.Max(r => r["x1"]); - float minTop = sortedRects.Min(r => r["top"]); - float maxBottom = sortedRects.Max(r => r["bottom"]); + // Helper functions for text extraction + internal static class TextExtractionHelpers + { + internal static List> ExtractWords(List chars, Dictionary kwargs = null) + { + if (kwargs == null) kwargs = new Dictionary(); + var extractor = new WordExtractor( + x_tolerance: kwargs.ContainsKey("x_tolerance") ? Convert.ToSingle(kwargs["x_tolerance"]) : TableFlags.TABLE_DEFAULT_X_TOLERANCE, + y_tolerance: kwargs.ContainsKey("y_tolerance") ? Convert.ToSingle(kwargs["y_tolerance"]) : TableFlags.TABLE_DEFAULT_Y_TOLERANCE, + keep_blank_chars: kwargs.ContainsKey("keep_blank_chars") && (bool)kwargs["keep_blank_chars"], + use_text_flow: kwargs.ContainsKey("use_text_flow") && (bool)kwargs["use_text_flow"], + horizontal_ltr: !kwargs.ContainsKey("horizontal_ltr") || (bool)kwargs["horizontal_ltr"], + vertical_ttb: kwargs.ContainsKey("vertical_ttb") && (bool)kwargs["vertical_ttb"], + split_at_punctuation: kwargs.ContainsKey("split_at_punctuation") && (bool)kwargs["split_at_punctuation"], + expand_ligatures: !kwargs.ContainsKey("expand_ligatures") || (bool)kwargs["expand_ligatures"] + ); + return extractor.ExtractWords(chars); + } - // Create edges based on the rectangles. - var edges = sortedRects.Select(b => new Edge - { - x0 = b["x0"], - x1 = b["x0"], - top = minTop, - bottom = maxBottom, - height = maxBottom - minTop, - orientation = "v" - }).ToList(); + internal static TextMap CharsToTextmap(List chars, Dictionary kwargs = null) + { + if (kwargs == null) kwargs = new Dictionary(); + kwargs["presorted"] = true; - edges.Add(new Edge - { - x0 = maxX1, - x1 = maxX1, - top = minTop, - bottom = maxBottom, - height = maxBottom - minTop, - orientation = "v" - }); + var extractor = new WordExtractor( + x_tolerance: kwargs.ContainsKey("x_tolerance") ? Convert.ToSingle(kwargs["x_tolerance"]) : TableFlags.TABLE_DEFAULT_X_TOLERANCE, + y_tolerance: kwargs.ContainsKey("y_tolerance") ? Convert.ToSingle(kwargs["y_tolerance"]) : TableFlags.TABLE_DEFAULT_Y_TOLERANCE, + keep_blank_chars: kwargs.ContainsKey("keep_blank_chars") && (bool)kwargs["keep_blank_chars"], + use_text_flow: kwargs.ContainsKey("use_text_flow") && (bool)kwargs["use_text_flow"], + expand_ligatures: !kwargs.ContainsKey("expand_ligatures") || (bool)kwargs["expand_ligatures"] + ); - return edges; + var wordmap = extractor.ExtractWordmap(chars); + return wordmap.ToTextmap( + layout: kwargs.ContainsKey("layout") && (bool)kwargs["layout"], + layoutWidth: kwargs.ContainsKey("layout_width") ? Convert.ToSingle(kwargs["layout_width"]) : 0, + layoutHeight: kwargs.ContainsKey("layout_height") ? Convert.ToSingle(kwargs["layout_height"]) : 0, + layoutWidthChars: kwargs.ContainsKey("layout_width_chars") ? Convert.ToInt32(kwargs["layout_width_chars"]) : 0, + layoutHeightChars: kwargs.ContainsKey("layout_height_chars") ? Convert.ToInt32(kwargs["layout_height_chars"]) : 0, + xDensity: kwargs.ContainsKey("x_density") ? Convert.ToSingle(kwargs["x_density"]) : TableFlags.TABLE_DEFAULT_X_DENSITY, + yDensity: kwargs.ContainsKey("y_density") ? Convert.ToSingle(kwargs["y_density"]) : TableFlags.TABLE_DEFAULT_Y_DENSITY, + xShift: kwargs.ContainsKey("x_shift") ? Convert.ToSingle(kwargs["x_shift"]) : 0, + yShift: kwargs.ContainsKey("y_shift") ? 
Convert.ToSingle(kwargs["y_shift"]) : 0, + yTolerance: kwargs.ContainsKey("y_tolerance") ? Convert.ToSingle(kwargs["y_tolerance"]) : TableFlags.TABLE_DEFAULT_Y_TOLERANCE, + useTextFlow: kwargs.ContainsKey("use_text_flow") && (bool)kwargs["use_text_flow"], + presorted: kwargs.ContainsKey("presorted") && (bool)kwargs["presorted"], + expandLigatures: !kwargs.ContainsKey("expand_ligatures") || (bool)kwargs["expand_ligatures"] + ); } - // Given a list of edges, return the points at which they intersect - // within `tolerance` pixels. - public class Intersection + internal static string ExtractText(List chars, Dictionary kwargs = null) { - public float x0 { get; set; } - public float top { get; set; } - public float x1 { get; set; } - public float bottom { get; set; } - public List VerticalEdges { get; set; } - public List HorizontalEdges { get; set; } + if (kwargs == null) kwargs = new Dictionary(); + var charsList = TableHelpers.ToList(chars); + if (charsList.Count == 0) + return ""; - public Intersection() - { - this.VerticalEdges = new List(); - this.HorizontalEdges = new List(); - } - } + if (kwargs.ContainsKey("layout") && (bool)kwargs["layout"]) + return CharsToTextmap(charsList, kwargs).as_string; - public static Dictionary edges_to_intersections( - List edges, float x_tolerance = 1.0f, float y_tolerance = 1.0f) - { - var intersections = new Dictionary(); + float yTolerance = kwargs.ContainsKey("y_tolerance") ? Convert.ToSingle(kwargs["y_tolerance"]) : TableFlags.TABLE_DEFAULT_Y_TOLERANCE; + var extractor = new WordExtractor( + x_tolerance: kwargs.ContainsKey("x_tolerance") ? Convert.ToSingle(kwargs["x_tolerance"]) : TableFlags.TABLE_DEFAULT_X_TOLERANCE, + y_tolerance: yTolerance, + keep_blank_chars: kwargs.ContainsKey("keep_blank_chars") && (bool)kwargs["keep_blank_chars"], + use_text_flow: kwargs.ContainsKey("use_text_flow") && (bool)kwargs["use_text_flow"], + expand_ligatures: !kwargs.ContainsKey("expand_ligatures") || (bool)kwargs["expand_ligatures"] + ); - // Separate vertical and horizontal edges - var vEdges = edges.Where(e => e.orientation == "v").ToList(); - var hEdges = edges.Where(e => e.orientation == "h").ToList(); + var words = extractor.ExtractWords(charsList); + if (words.Count == 0) + return ""; - // Sort edges (vertical by X0 then Top, horizontal by Top then X0) - vEdges = vEdges.OrderBy(e => e.x0).ThenBy(e => e.top).ToList(); - hEdges = hEdges.OrderBy(e => e.top).ThenBy(e => e.x0).ToList(); + int rotation = words[0].ContainsKey("rotation") ? 
Convert.ToInt32(words[0]["rotation"]) : 0; - foreach (var v in vEdges) + if (rotation == 90) { - foreach (var h in hEdges) + words = words.OrderBy(w => Convert.ToSingle(w["x1"])).ThenByDescending(w => Convert.ToSingle(w["top"])).ToList(); + return string.Join(" ", words.Select(w => w["text"].ToString())); + } + else if (rotation == 270) + { + words = words.OrderByDescending(w => Convert.ToSingle(w["x1"])).ThenBy(w => Convert.ToSingle(w["top"])).ToList(); + return string.Join(" ", words.Select(w => w["text"].ToString())); + } + else + { + var lines = TableHelpers.ClusterObjects(words, w => Convert.ToSingle(w["doctop"]), yTolerance); + var result = string.Join("\n", lines.Select(line => string.Join(" ", line.Select(w => w["text"].ToString())))); + if (rotation == 180) { - // Check if the vertical and horizontal lines intersect within tolerance - if (v.top <= h.top + y_tolerance && v.bottom >= h.top - y_tolerance && - v.x0 >= h.x0 - x_tolerance && v.x0 <= h.x1 + x_tolerance) - { - var vertex = new Point(v.x0, h.top); - - if (!intersections.ContainsKey(vertex)) - { - intersections[vertex] = new Intersection(); - } - - intersections[vertex].VerticalEdges.Add(v); - intersections[vertex].HorizontalEdges.Add(h); - } + var charArray = result.ToCharArray(); + Array.Reverse(charArray); + return new string(charArray.Select(c => c == '\n' ? ' ' : c).ToArray()); } + return result; } - - return intersections; } - // Return the bounding box for an object. - static BBox obj_to_bbox(Edge edge) + internal static string CollateLine(List lineChars, float tolerance = TableFlags.TABLE_DEFAULT_X_TOLERANCE) { - return new BBox(edge.x0, edge.top, edge.x1, edge.bottom); + var coll = new StringBuilder(); + float? lastX1 = null; + foreach (var char_ in lineChars.OrderBy(c => c.x0)) + { + if (lastX1.HasValue && char_.x0 > (lastX1.Value + tolerance)) + coll.Append(" "); + lastX1 = char_.x1; + coll.Append(char_.text); + } + return coll.ToString(); } - - // Given a list of points(`intersections`), return all rectangular "cells" - // that those points describe. - // `intersections` should be a dictionary with (x0, top) tuples as keys, - // and a list of edge objects as values.The edge objects should correspond - // to the edges that touch the intersection. 
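+        // A minimal sketch of the gap rule that CollateLine applies when flattening one line
+        // of characters into a string: a space is emitted whenever the horizontal gap between
+        // consecutive characters exceeds the x tolerance. The tuple shape is illustrative;
+        // the real method works on CharDict objects.
+        static string CollateLineSketch(IEnumerable<(string text, float x0, float x1)> lineChars,
+                                        float tolerance = 3f)
+        {
+            var sb = new StringBuilder();
+            float? lastX1 = null;
+            foreach (var ch in lineChars.OrderBy(c => c.x0))
+            {
+                if (lastX1.HasValue && ch.x0 > lastX1.Value + tolerance)
+                    sb.Append(' ');                  // gap wider than tolerance => word break
+                lastX1 = ch.x1;
+                sb.Append(ch.text);
+            }
+            return sb.ToString();
+        }
+        // ("H",0,5) ("i",5,9) ("t",20,24) ("o",24,28) with tolerance 3 yields "Hi to".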
- public static List intersections_to_cells(Dictionary intersections) + internal static List DedupeChars(List chars, float tolerance = 1) { - var points = intersections.Keys.OrderBy(p => p.X).ToList(); - int nPoints = points.Count; - - bool edge_connects(Point p1, Point p2) - { - HashSet edges_to_set(List edges) - { - return new HashSet(edges.Select(obj_to_bbox)); - } + var key = new Func(c => new { c.fontname, c.size, c.upright, c.text }); + var posKey = new Func(c => new { c.doctop, c.x0 }); - if (p1.X == p2.X) - { - var key1 = new Point(-1,-1); - var key2 = new Point(-1,-1); - foreach (var ikey in intersections.Keys) - { - if (ikey.EqualTo(p1)) - { - key1 = ikey; - } - if (ikey.EqualTo(p2)) - { - key2 = ikey; - } - } - if (key1.X < 0 || key2.X < 0) - { - return false; - } - var common = edges_to_set(intersections[key1].VerticalEdges).Intersect(edges_to_set(intersections[key2].VerticalEdges)).ToList(); - if (common.Any()) return true; - } + var sortedChars = chars.OrderBy(key).ToList(); + var uniqueChars = new List(); - if (p1.Y == p2.Y) + foreach (var group in sortedChars.GroupBy(key)) + { + var yClusters = TableHelpers.ClusterObjects(group.ToList(), c => c.doctop, tolerance); + foreach (var yCluster in yClusters) { - var key1 = new Point(-1, -1); - var key2 = new Point(-1, -1); - foreach (var ikey in intersections.Keys) - { - if (ikey.EqualTo(p1)) - { - key1 = ikey; - } - if (ikey.EqualTo(p2)) - { - key2 = ikey; - } - } - if (key1.X < 0 || key2.X < 0) + var xClusters = TableHelpers.ClusterObjects(yCluster, c => c.x0, tolerance); + foreach (var xCluster in xClusters) { - return false; + uniqueChars.Add(xCluster.OrderBy(c => posKey(c)).First()); } - var common = edges_to_set(intersections[key1].HorizontalEdges).Intersect(edges_to_set(intersections[key2].HorizontalEdges)).ToList(); - if (common.Any()) return true; } - - return false; } - BBox find_smallest_cell(int i) - { - if (i == nPoints - 1) return null; + return uniqueChars.OrderBy(c => chars.IndexOf(c)).ToList(); + } + } - var pt = points[i]; - var rest = points.Skip(i + 1).ToList(); - // Get all the points directly below and directly right - var below = rest.Where(x => x.X == pt.X).ToList(); - var right = rest.Where(x => x.Y == pt.Y).ToList(); + // Edge processing functions + internal static class EdgeProcessing + { + // line_to_edge - Convert line to edge + internal static Edge LineToEdge(Dictionary line) + { + var edge = new Edge + { + x0 = Convert.ToSingle(line["x0"]), + x1 = Convert.ToSingle(line["x1"]), + top = Convert.ToSingle(line["top"]), + bottom = Convert.ToSingle(line["bottom"]), + width = line.ContainsKey("width") ? Convert.ToSingle(line["width"]) : 0, + height = line.ContainsKey("height") ? Convert.ToSingle(line["height"]) : 0, + orientation = Convert.ToSingle(line["top"]) == Convert.ToSingle(line["bottom"]) ? "h" : "v", + object_type = line.ContainsKey("object_type") ? line["object_type"].ToString() : "line", + doctop = line.ContainsKey("doctop") ? Convert.ToSingle(line["doctop"]) : 0, + page_number = line.ContainsKey("page_number") ? Convert.ToInt32(line["page_number"]) : 0, + y0 = line.ContainsKey("y0") ? Convert.ToSingle(line["y0"]) : 0, + y1 = line.ContainsKey("y1") ? 
Convert.ToSingle(line["y1"]) : 0 + }; + return edge; + } - foreach (var belowPt in below) - { - if (!edge_connects(pt, belowPt)) continue; + // rect_to_edges - Convert rectangle to 4 edges + internal static List RectToEdges(Dictionary rect) + { + var edges = new List(); + float x0 = Convert.ToSingle(rect["x0"]); + float top = Convert.ToSingle(rect["top"]); + float x1 = Convert.ToSingle(rect["x1"]); + float bottom = Convert.ToSingle(rect["bottom"]); + float width = x1 - x0; + float height = bottom - top; + float doctop = rect.ContainsKey("doctop") ? Convert.ToSingle(rect["doctop"]) : top; + + // Top edge + edges.Add(new Edge + { + x0 = x0, + x1 = x1, + top = bottom, + bottom = top, + width = width, + height = 0, + orientation = "h", + object_type = "rect_edge", + doctop = doctop, + y0 = bottom, + y1 = top + }); - foreach (var rightPt in right) - { - if (!edge_connects(pt, rightPt)) continue; - - Point bottomRight = new Point(rightPt.X, belowPt.Y); - - if (intersections.Keys.Any(p => p.EqualTo(rightPt)) - && edge_connects(bottomRight, rightPt) - && edge_connects(bottomRight, belowPt)) - { - return new BBox(pt.X, pt.Y, bottomRight.X, bottomRight.Y); - } - } - } + // Bottom edge + edges.Add(new Edge + { + x0 = x0, + x1 = x1, + top = top + height, + bottom = top + height, + width = width, + height = 0, + orientation = "h", + object_type = "rect_edge", + doctop = doctop + height, + y0 = top + height, + y1 = top + height + }); - return null; - } + // Left edge + edges.Add(new Edge + { + x0 = x0, + x1 = x0, + top = top, + bottom = bottom, + width = 0, + height = height, + orientation = "v", + object_type = "rect_edge", + doctop = doctop, + y0 = bottom, + y1 = top + }); - List bBoxes = new List(); - for (int i = 0; i < points.Count; i++) + // Right edge + edges.Add(new Edge { - BBox bbox = find_smallest_cell(i); - if (bbox != null) - bBoxes.Add(bbox); - } - return bBoxes; + x0 = x1, + x1 = x1, + top = top, + bottom = bottom, + width = 0, + height = height, + orientation = "v", + object_type = "rect_edge", + doctop = doctop, + y0 = bottom, + y1 = top + }); + + return edges; } - // Given a list of bounding boxes(`cells`), return a list of tables that - // hold those cells most simply(and contiguously). - public static List> cells_to_tables(Page page, List cells) + // curve_to_edges - Convert curve to edges + internal static List CurveToEdges(Dictionary curve) { - List bbox_to_corners(BBox bbox) + var edges = new List(); + var pts = curve["pts"] as List; + if (pts == null) return edges; + + float doctop = curve.ContainsKey("doctop") ? Convert.ToSingle(curve["doctop"]) : 0; + float top = curve.ContainsKey("top") ? 
Convert.ToSingle(curve["top"]) : 0; + + for (int i = 0; i < pts.Count - 1; i++) { - // Decompose the bounding box into its individual components - float x0 = bbox.x0; - float top = bbox.top; - float x1 = bbox.x1; - float bottom = bbox.bottom; + var p0Obj = pts[i] as List; + var p1Obj = pts[i + 1] as List; + if (p0Obj == null || p1Obj == null || p0Obj.Count < 2 || p1Obj.Count < 2) + continue; + + float p0x = Convert.ToSingle(p0Obj[0]); + float p0y = Convert.ToSingle(p0Obj[1]); + float p1x = Convert.ToSingle(p1Obj[0]); + float p1y = Convert.ToSingle(p1Obj[1]); + + string orientation = null; + if (p0x == p1x) + orientation = "v"; + else if (p0y == p1y) + orientation = "h"; + + if (orientation == null) continue; - // Return the four corners as a list of tuples - return new List + edges.Add(new Edge { - new Point(x0, top), - new Point(x0, bottom), - new Point(x1, top), - new Point(x1, bottom) - }; + x0 = Math.Min(p0x, p1x), + x1 = Math.Max(p0x, p1x), + top = Math.Min(p0y, p1y), + bottom = Math.Max(p0y, p1y), + width = Math.Abs(p0x - p1x), + height = Math.Abs(p0y - p1y), + orientation = orientation, + object_type = "curve_edge", + doctop = Math.Min(p0y, p1y) + (doctop - top), + y0 = Math.Max(p0y, p1y), + y1 = Math.Min(p0y, p1y) + }); } - List remainingCells = new List(cells); - List> tables = new List>(); + return edges; + } - // Iterate through the cells found above, and assign them - // to contiguous tables - HashSet currentCorners = new HashSet(); - List currentCells = new List(); + // obj_to_edges - Convert object to edges + internal static List ObjToEdges(Dictionary obj) + { + string objType = obj.ContainsKey("object_type") ? obj["object_type"].ToString() : ""; + + if (objType.Contains("_edge")) + return new List { LineToEdge(obj) }; + else if (objType == "line") + return new List { LineToEdge(obj) }; + else if (objType == "rect") + return RectToEdges(obj); + else if (objType == "curve") + return CurveToEdges(obj); + + return new List(); + } - while (remainingCells.Count > 0) + // filter_edges - Filter edges by orientation, type, and min length + internal static List FilterEdges( + List edges, + string orientation = null, + string edgeType = null, + float minLength = 1) + { + if (orientation != null && orientation != "v" && orientation != "h") + throw new ArgumentException("Orientation must be 'v' or 'h'"); + + return edges.Where(e => { - int initialCellCount = currentCells.Count; + string dim = e.orientation == "v" ? "height" : "width"; + float dimValue = e.orientation == "v" ? e.height : e.width; + bool etCorrect = edgeType == null || e.object_type == edgeType; + bool orientCorrect = orientation == null || e.orientation == orientation; + return etCorrect && orientCorrect && dimValue >= minLength; + }).ToList(); + } - foreach (var cell in new List(remainingCells)) + // snap_objects - Snap objects to their average position + internal static List> SnapObjects( + IEnumerable> objs, + string attr, + float tolerance) + { + string axis = attr == "x0" || attr == "x1" ? "h" : "v"; + var objsList = objs.ToList(); + var clusters = TableHelpers.ClusterObjects(objsList, obj => Convert.ToSingle(obj[attr]), tolerance); + var avgs = clusters.Select(cluster => cluster.Average(obj => Convert.ToSingle(obj[attr]))).ToList(); + + var snappedClusters = new List>>(); + for (int i = 0; i < clusters.Count; i++) + { + float avg = avgs[i]; + var snapped = clusters[i].Select(obj => { - List cellCorners = bbox_to_corners(cell); - // If we're just starting a table ... 
- if (currentCells.Count == 0) + var newObj = new Dictionary(obj); + float oldValue = Convert.ToSingle(obj[attr]); + float diff = avg - oldValue; + + if (axis == "h") { - // ... immediately assign it to the empty group - currentCorners.UnionWith(cellCorners); - currentCells.Add(cell); - remainingCells.Remove(cell); + newObj["x0"] = Convert.ToSingle(obj["x0"]) + diff; + newObj["x1"] = Convert.ToSingle(obj["x1"]) + diff; } else { - // How many corners does this table share with the current group? - int cornerCount = cellCorners.Count(corner => currentCorners.Any(cc => cc.EqualTo(corner))); - - // If touching on at least one corner... - if (cornerCount > 0) - { - // ... assign it to the current group - currentCorners.UnionWith(cellCorners); - currentCells.Add(cell); - remainingCells.Remove(cell); - } + newObj["top"] = Convert.ToSingle(obj["top"]) + diff; + newObj["bottom"] = Convert.ToSingle(obj["bottom"]) + diff; + if (obj.ContainsKey("doctop")) + newObj["doctop"] = Convert.ToSingle(obj["doctop"]) + diff; + if (obj.ContainsKey("y0")) + newObj["y0"] = Convert.ToSingle(obj["y0"]) - diff; + if (obj.ContainsKey("y1")) + newObj["y1"] = Convert.ToSingle(obj["y1"]) - diff; } - } - // If this iteration did not find any more cells to append... - if (currentCells.Count == initialCellCount) - { - tables.Add(new List(currentCells)); - currentCorners.Clear(); - currentCells.Clear(); - } + return newObj; + }).ToList(); + snappedClusters.Add(snapped); } + + return snappedClusters.SelectMany(x => x).ToList(); + } - // Once we have exhausting the list of cells ... - // ... and we have a cell group that has not been stored - if (currentCells.Count > 0) + // snap_edges - Snap edges within tolerance + internal static List SnapEdges( + List edges, + float xTolerance = TableFlags.TABLE_DEFAULT_SNAP_TOLERANCE, + float yTolerance = TableFlags.TABLE_DEFAULT_SNAP_TOLERANCE) + { + var byOrientation = new Dictionary> { - tables.Add(new List(currentCells)); - } + { "v", new List() }, + { "h", new List() } + }; - // remove tables without text or having only 1 column - for (int i = tables.Count - 1; i >= 0; i--) - { - var r = new BBox(0, 0, 0, 0); // EMPTY_RECT placeholder - var x1Vals = new HashSet(); - var x0Vals = new HashSet(); + foreach (var e in edges) + byOrientation[e.orientation].Add(e); - foreach (var cell in tables[i]) - { - r |= cell; - x1Vals.Add(cell.x1); - x0Vals.Add(cell.x0); - } + var snappedV = SnapEdgesByOrientation(byOrientation["v"], "x0", xTolerance); + var snappedH = SnapEdgesByOrientation(byOrientation["h"], "top", yTolerance); + + return snappedV.Concat(snappedH).ToList(); + } + + private static List SnapEdgesByOrientation(List edges, string attr, float tolerance) + { + if (edges.Count == 0) return edges; + + var clusters = TableHelpers.ClusterObjects(edges, e => GetEdgeValue(e, attr), tolerance); + var avgs = clusters.Select(cluster => cluster.Average(e => GetEdgeValue(e, attr))).ToList(); - string rText = page.GetTextbox(new Rect(r.x0, r.top, r.x1, r.bottom)); - if (x1Vals.Count < 2 || x0Vals.Count < 2 || whiteSpaces_issuperset(rText)) + var result = new List(); + for (int i = 0; i < clusters.Count; i++) + { + float avg = avgs[i]; + foreach (var e in clusters[i]) { - tables.RemoveAt(i); + var snapped = new Edge + { + x0 = e.x0, + x1 = e.x1, + top = e.top, + bottom = e.bottom, + width = e.width, + height = e.height, + orientation = e.orientation, + object_type = e.object_type, + doctop = e.doctop, + page_number = e.page_number, + y0 = e.y0, + y1 = e.y1 + }; + + float diff = avg - GetEdgeValue(e, 
attr); + if (attr == "x0") + { + snapped.x0 = avg; + snapped.x1 = e.x1 + diff; + snapped.width = snapped.x1 - snapped.x0; + } + else if (attr == "top") + { + snapped.top = avg; + snapped.bottom = e.bottom + diff; + snapped.height = snapped.bottom - snapped.top; + snapped.doctop = e.doctop + diff; + } + + result.Add(snapped); } } - // Sort the tables top-to-bottom-left-to-right based on the value of the - // topmost-and-then-leftmost coordinate of a table. - var sortedTables = tables.OrderBy(t => t.Min(c => c.top)) - .ThenBy(t => t.Min(c => c.x0)) - .ToList(); - - return sortedTables; - } - - public static List extract_words(List chars, Dictionary kwargs) - { - // WordExtractor parameters - float x_tolerance = TableFlags.TABLE_DEFAULT_X_TOLERANCE; - float y_tolerance = TableFlags.TABLE_DEFAULT_Y_TOLERANCE; - bool keep_blank_chars = false; - bool use_text_flow = false; - bool horizontal_ltr = true; - bool vertical_ttb = false; - List extra_attrs = null; - bool split_at_punctuation = false; - bool expand_ligatures = true; - - foreach (string key in kwargs.Keys) - { - switch (key) - { - case "x_tolerance": - x_tolerance = float.Parse(kwargs[key].ToString(), System.Globalization.CultureInfo.InvariantCulture); break; - case "y_tolerance": - y_tolerance = float.Parse(kwargs[key].ToString(), System.Globalization.CultureInfo.InvariantCulture); break; - case "keep_blank_chars": - keep_blank_chars = bool.Parse(kwargs[key].ToString()); break; - case "use_text_flow": - use_text_flow = bool.Parse(kwargs[key].ToString()); break; - case "horizontal_ltr": - horizontal_ltr = bool.Parse(kwargs[key].ToString()); break; - case "vertical_ttb": - vertical_ttb = bool.Parse(kwargs[key].ToString()); break; - case "extra_attrs": - extra_attrs = (List)kwargs[key]; break; - case "split_at_punctuation": - split_at_punctuation = bool.Parse(kwargs[key].ToString()); break; - case "expand_ligatures": - expand_ligatures = bool.Parse(kwargs[key].ToString()); break; - default: - break; - } - } - - WordExtractor extractor = new WordExtractor( - x_tolerance, - y_tolerance, - keep_blank_chars, - use_text_flow, - horizontal_ltr, - vertical_ttb, - extra_attrs, - split_at_punctuation, - expand_ligatures - ); - - return extractor.extract_words(chars); + return result; } - public static TextMap chars_to_textmap(List chars, Dictionary kwargs) + private static float GetEdgeValue(Edge e, string attr) { - // Add the presorted parameter - kwargs["presorted"] = true; + switch (attr) + { + case "x0": + return e.x0; + case "top": + return e.top; + default: + return 0; + } + } - // WordExtractor parameters - float x_tolerance = TableFlags.TABLE_DEFAULT_X_TOLERANCE; - float y_tolerance = TableFlags.TABLE_DEFAULT_Y_TOLERANCE; - bool keep_blank_chars = false; - bool use_text_flow = false; - bool horizontal_ltr = true; - bool vertical_ttb = false; - List extra_attrs = null; - bool split_at_punctuation = false; - bool expand_ligatures = true; - - // WordMap parameters - bool layout = false; - float layout_width = 0f; - float layout_height = 0f; - int layout_width_chars = 0; - int layout_height_chars = 0; - float x_density = TableFlags.TABLE_DEFAULT_X_DENSITY; - float y_density = TableFlags.TABLE_DEFAULT_Y_DENSITY; - float x_shift = 0; - float y_shift = 0; - bool presorted = false; - - foreach (string key in kwargs.Keys) - { - switch (key) - { - case "x_tolerance": - x_tolerance = (float)kwargs[key]; break; - case "y_tolerance": - y_tolerance = (float)kwargs[key]; break; - case "keep_blank_chars": - keep_blank_chars = (bool)kwargs[key]; break; - 
case "use_text_flow": - use_text_flow = (bool)kwargs[key]; break; - case "horizontal_ltr": - horizontal_ltr = (bool)kwargs[key]; break; - case "vertical_ttb": - vertical_ttb = (bool)kwargs[key]; break; - case "extra_attrs": - extra_attrs = (List)kwargs[key]; break; - case "split_at_punctuation": - split_at_punctuation = (bool)kwargs[key]; break; - case "expand_ligatures": - expand_ligatures = (bool)kwargs[key]; break; - case "layout": - layout = (bool)kwargs[key]; break; - case "layout_width": - layout_width = (float)kwargs[key]; break; - case "layout_height": - layout_height = (float)kwargs[key]; break; - case "layout_width_chars": - layout_width_chars = (int)kwargs[key]; break; - case "layout_height_chars": - layout_height_chars = (int)kwargs[key]; break; - case "x_density": - x_density = (float)kwargs[key]; break; - case "y_density": - y_density = (float)kwargs[key]; break; - case "x_shift": - x_shift = (float)kwargs[key]; break; - case "y_shift": - y_shift = (float)kwargs[key]; break; - case "presorted": - presorted = (bool)kwargs[key]; break; - default: - break; - } - } - - WordExtractor extractor = new WordExtractor( - x_tolerance, - y_tolerance, - keep_blank_chars, - use_text_flow, - horizontal_ltr, - vertical_ttb, - extra_attrs, - split_at_punctuation, - expand_ligatures - ); + // resize_object - Resize an object by changing a key value + internal static Dictionary ResizeObject(Dictionary obj, string key, float value) + { + if (key != "x0" && key != "x1" && key != "top" && key != "bottom") + throw new ArgumentException("Key must be 'x0', 'x1', 'top', or 'bottom'"); - WordMap wordmap = extractor.extract_wordmap(chars); - - TextMap textmap = wordmap.to_textmap( - layout, - layout_width, - layout_height, - layout_width_chars, - layout_height_chars, - x_density, - y_density, - x_shift, - y_shift, - y_tolerance, - use_text_flow, - presorted, - expand_ligatures - ); + var newObj = new Dictionary(obj); + float oldValue = Convert.ToSingle(obj[key]); + float diff = value - oldValue; + + newObj[key] = value; - return textmap; - } - - public static string extract_text(List chars, Dictionary kwargs) - { - // WordExtractor parameters - float x_tolerance = TableFlags.TABLE_DEFAULT_X_TOLERANCE; - float y_tolerance = TableFlags.TABLE_DEFAULT_Y_TOLERANCE; - bool keep_blank_chars = false; - bool use_text_flow = false; - bool horizontal_ltr = true; - bool vertical_ttb = false; - List extra_attrs = null; - bool split_at_punctuation = false; - bool expand_ligatures = true; - - // WordMap parameters - bool layout = false; - float layout_width = 0f; - float layout_height = 0f; - int layout_width_chars = 0; - int layout_height_chars = 0; - float x_density = TableFlags.TABLE_DEFAULT_X_DENSITY; - float y_density = TableFlags.TABLE_DEFAULT_Y_DENSITY; - float x_shift = 0; - float y_shift = 0; - bool presorted = false; - - foreach (string key in kwargs.Keys) - { - switch (key) - { - case "x_tolerance": - x_tolerance = (float)kwargs[key]; break; - case "y_tolerance": - y_tolerance = (float)kwargs[key]; break; - case "keep_blank_chars": - keep_blank_chars = (bool)kwargs[key]; break; - case "use_text_flow": - use_text_flow = (bool)kwargs[key]; break; - case "horizontal_ltr": - horizontal_ltr = (bool)kwargs[key]; break; - case "vertical_ttb": - vertical_ttb = (bool)kwargs[key]; break; - case "extra_attrs": - extra_attrs = (List)kwargs[key]; break; - case "split_at_punctuation": - split_at_punctuation = (bool)kwargs[key]; break; - case "expand_ligatures": - expand_ligatures = (bool)kwargs[key]; break; - case 
"layout": - layout = (bool)kwargs[key]; break; - case "layout_width": - layout_width = (float)kwargs[key]; break; - case "layout_height": - layout_height = (float)kwargs[key]; break; - case "layout_width_chars": - layout_width_chars = (int)kwargs[key]; break; - case "layout_height_chars": - layout_height_chars = (int)kwargs[key]; break; - case "x_density": - x_density = (float)kwargs[key]; break; - case "y_density": - y_density = (float)kwargs[key]; break; - case "x_shift": - x_shift = (float)kwargs[key]; break; - case "y_shift": - y_shift = (float)kwargs[key]; break; - case "presorted": - presorted = (bool)kwargs[key]; break; - default: - break; - } - } - - if (chars.Count == 0) + if (key == "x0") { - return ""; + if (value > Convert.ToSingle(obj["x1"])) + throw new ArgumentException("x0 must be <= x1"); + newObj["width"] = Convert.ToSingle(obj["x1"]) - value; } + else if (key == "x1") + { + if (value < Convert.ToSingle(obj["x0"])) + throw new ArgumentException("x1 must be >= x0"); + newObj["width"] = value - Convert.ToSingle(obj["x0"]); + } + else if (key == "top") + { + if (value > Convert.ToSingle(obj["bottom"])) + throw new ArgumentException("top must be <= bottom"); + newObj["doctop"] = Convert.ToSingle(obj["doctop"]) + diff; + newObj["height"] = Convert.ToSingle(obj["height"]) - diff; + if (obj.ContainsKey("y1")) + newObj["y1"] = Convert.ToSingle(obj["y1"]) - diff; + } + else if (key == "bottom") + { + if (value < Convert.ToSingle(obj["top"])) + throw new ArgumentException("bottom must be >= top"); + newObj["height"] = Convert.ToSingle(obj["height"]) + diff; + if (obj.ContainsKey("y0")) + newObj["y0"] = Convert.ToSingle(obj["y0"]) - diff; + } + + return newObj; + } - // Layout handling - if (layout == true) + // join_edge_group - Join edges along the same line + internal static List JoinEdgeGroup_( + List edges, + string orientation, + float tolerance = TableFlags.TABLE_DEFAULT_JOIN_TOLERANCE) + { + string minProp, maxProp; + if (orientation == "h") + { + minProp = "x0"; + maxProp = "x1"; + } + else if (orientation == "v") { - return chars_to_textmap(chars, kwargs).AsString; + minProp = "top"; + maxProp = "bottom"; } else { - WordExtractor extractor = new WordExtractor( - x_tolerance, - y_tolerance, - keep_blank_chars, - use_text_flow, - horizontal_ltr, - vertical_ttb, - extra_attrs, - split_at_punctuation, - expand_ligatures - ); + throw new ArgumentException("Orientation must be 'v' or 'h'"); + } - // Extract words using WordExtractor - List words = extractor.extract_words(chars); - // rotation cannot change within a cell - int rotation = words.Count > 0 ? 
(int)words[0].rotation : 0; + var sortedEdges = edges.OrderBy(e => GetEdgeValue(e, minProp)).ToList(); + if (sortedEdges.Count == 0) return new List(); - string lines; + var joined = new List { sortedEdges[0] }; - if (rotation == 90) - { - // Sort for rotation 90 - words = words.OrderBy(w => w.x1).ThenByDescending(w => w.top).ToList(); - lines = string.Join(" ", words.Select(w => w.text.ToString())); - } - else if (rotation == 270) + for (int i = 1; i < sortedEdges.Count; i++) + { + var e = sortedEdges[i]; + var last = joined[joined.Count - 1]; + + float eMin = GetEdgeValue(e, minProp); + float lastMax = GetEdgeValue(last, maxProp); + + if (eMin <= (lastMax + tolerance)) { - // Sort for rotation 270 - words = words.OrderByDescending(w => w.x1).ThenBy(w => w.top).ToList(); - lines = string.Join(" ", words.Select(w => w.text.ToString())); + float eMax = GetEdgeValue(e, maxProp); + if (eMax > lastMax) + { + // Extend current edge + var extended = new Edge + { + x0 = last.x0, + x1 = last.x1, + top = last.top, + bottom = last.bottom, + width = last.width, + height = last.height, + orientation = last.orientation, + object_type = last.object_type, + doctop = last.doctop, + page_number = last.page_number, + y0 = last.y0, + y1 = last.y1 + }; + + if (orientation == "h") + { + extended.x1 = e.x1; + extended.width = extended.x1 - extended.x0; + } + else + { + extended.bottom = e.bottom; + extended.height = extended.bottom - extended.top; + } + + joined[joined.Count - 1] = extended; + } } else { - // Cluster words based on doctop - var linesGrouped = cluster_objects(words, obj=>obj.doctop, y_tolerance); - lines = string.Join("\n", linesGrouped.Select(line => string.Join(" ", line.Select(w => w.text)))); + joined.Add(e); + } + } + + return joined; + } + + internal static List JoinEdgeGroup( + List edges, + string orientation, + float tolerance) + { + Func minProp; + Func maxProp; + Action setMaxProp; + + // Select properties based on orientation + if (orientation == "h") + { + minProp = e => e.x0; + maxProp = e => e.x1; + setMaxProp = (e, v) => e.x1 = v; + } + else if (orientation == "v") + { + minProp = e => e.top; + maxProp = e => e.bottom; + setMaxProp = (e, v) => e.bottom = v; + } + else + { + throw new ArgumentException("Orientation must be 'h' or 'v'"); + } - if (rotation == 180) + if (edges == null || edges.Count == 0) + return new List(); + + // Sort edges by their minimum extent + var sortedEdges = edges + .OrderBy(minProp) + .ToList(); + + var joined = new List { sortedEdges[0] }; + + // Merge overlapping / nearby edges + for (int i = 1; i < sortedEdges.Count; i++) + { + var current = sortedEdges[i]; + var last = joined[joined.Count - 1]; + + if (minProp(current) <= maxProp(last) + tolerance) + { + // Extend the last edge if needed + if (maxProp(current) > maxProp(last)) { - // Special handling for rotation 180 (reverse lines and replace newline with spaces) - lines = new string(lines.Reverse().Select(c => c == '\n' ? 
' ' : c).ToArray()); + setMaxProp(last, maxProp(current)); } } - - return lines; + else + { + // Separate edge → start a new segment + joined.Add(current); + } } - } - } - public class TextItem - { - public string Text { get; set; } - public object Obj { get; set; } + return joined; + } - public TextItem(string text, object obj) + // merge_edges - Merge edges using snap and join + internal static List MergeEdges_( + List edges, + float snapXTolerance, + float snapYTolerance, + float joinXTolerance, + float joinYTolerance) { - Text = text; - Obj = obj; - } - } + if (snapXTolerance > 0 || snapYTolerance > 0) + edges = SnapEdges(edges, snapXTolerance, snapYTolerance); - public class TextMap - { - public List Tuples { get; set; } - public string AsString { get; set; } + // Use Tuple for grouping key (matching Python's get_group function) + var sorted = edges.OrderBy(e => Tuple.Create(e.orientation, e.orientation == "h" ? e.top : e.x0)).ToList(); + var edgeGroups = sorted.GroupBy(e => Tuple.Create(e.orientation, e.orientation == "h" ? e.top : e.x0)); - public TextMap(List tuples = null) + var merged = new List(); + foreach (var group in edgeGroups) + { + string orientation = group.Key.Item1; // First element of tuple is orientation + float tolerance = orientation == "h" ? joinXTolerance : joinYTolerance; + merged.AddRange(JoinEdgeGroup(group.ToList(), orientation, tolerance)); + } + + return merged; + } + + public static List MergeEdges( + List edges, + float snapXTolerance, + float snapYTolerance, + float joinXTolerance, + float joinYTolerance) { - Tuples = tuples ?? new List(); - AsString = string.Join("", Tuples.Select(item => item.Text)); + // Local grouping key (equivalent to Python get_group) + (string, float) GetGroupKey(Edge edge) + { + return edge.orientation == "h" + ? ("h", edge.top) + : ("v", edge.x0); + } + + // Optional snapping + if (snapXTolerance > 0 || snapYTolerance > 0) + { + edges = SnapEdges(edges, snapXTolerance, snapYTolerance); + } + + // Sort by group key + var sortedEdges = edges + .OrderBy(e => GetGroupKey(e).Item1) + .ThenBy(e => GetGroupKey(e).Item2) + .ToList(); + + // Group edges + var groupedEdges = sortedEdges + .GroupBy(GetGroupKey); + + // Join edge groups + var mergedEdges = new List(); + + foreach (var group in groupedEdges) + { + string orientation = group.Key.Item1; + float joinTolerance = + orientation == "h" ? 
joinXTolerance : joinYTolerance; + + var joined = JoinEdgeGroup( + group.ToList(), + orientation, + joinTolerance + ); + + mergedEdges.AddRange(joined); + } + + return mergedEdges; } - } - public class WordMap - { - public List>> Tuples { get; set; } + // bbox_to_rect - Convert bbox tuple to rect dict + internal static Dictionary BboxToRect(Tuple bbox) + { + return new Dictionary + { + { "x0", bbox.Item1 }, + { "top", bbox.Item2 }, + { "x1", bbox.Item3 }, + { "bottom", bbox.Item4 } + }; + } - public WordMap(List>> tuples) + // objects_to_rect - Get smallest rect containing objects + internal static Dictionary ObjectsToRect(IEnumerable objects) { - Tuples = tuples; + var bbox = TableHelpers.ObjectsToBbox(objects); + return BboxToRect(Tuple.Create(bbox.X0, bbox.Y0, bbox.X1, bbox.Y1)); } - public TextMap to_textmap( - bool layout = false, - float layoutWidth = 0, - float layoutHeight = 0, - int layoutWidthChars = 0, - int layoutHeightChars = 0, - float xDensity = TableFlags.TABLE_DEFAULT_X_DENSITY, - float yDensity = TableFlags.TABLE_DEFAULT_Y_DENSITY, - float xShift = 0, - float yShift = 0, - float yTolerance = TableFlags.TABLE_DEFAULT_Y_TOLERANCE, - bool useTextFlow = false, - bool presorted = false, - bool expandLigatures = true - ) + // merge_bboxes - Merge multiple bboxes + internal static Tuple MergeBboxes(IEnumerable> bboxes) { - var textMap = new List(); + var bboxList = bboxes.ToList(); + if (bboxList.Count == 0) + return Tuple.Create(0f, 0f, 0f, 0f); - if (Tuples.Count == 0) - return new TextMap(textMap); + return Tuple.Create( + bboxList.Min(b => b.Item1), + bboxList.Min(b => b.Item2), + bboxList.Max(b => b.Item3), + bboxList.Max(b => b.Item4) + ); + } - var expansions = expandLigatures ? TableFlags.TABLE_LIGATURES : new Dictionary(); + // words_to_edges_h - Find horizontal edges from words + internal static List WordsToEdgesH( + List> words, + int wordThreshold = (int)TableFlags.TABLE_DEFAULT_MIN_WORDS_HORIZONTAL) + { + var byTop = TableHelpers.ClusterObjects(words, w => Convert.ToSingle(w["top"]), 1); + var largeClusters = byTop.Where(x => x.Count >= wordThreshold).ToList(); + + if (largeClusters.Count == 0) + return new List(); - // Layout handling - if (layout) + var rects = largeClusters.Select(cluster => ObjectsToRect(cluster.Cast())).ToList(); + float minX0 = rects.Min(r => Convert.ToSingle(r["x0"])); + float maxX1 = rects.Max(r => Convert.ToSingle(r["x1"])); + + var edges = new List(); + foreach (var r in rects) { - if (layoutWidthChars > 0) - { - if (layoutWidth > 0) - { - throw new ArgumentException("`layoutWidth` and `layoutWidthChars` cannot both be set."); - } - } - else - { - layoutWidthChars = (int)Math.Round(layoutWidth / xDensity); - } + float top = Convert.ToSingle(r["top"]); + float bottom = Convert.ToSingle(r["bottom"]); - if (layoutHeightChars > 0) + // Top edge + edges.Add(new Edge { - if (layoutHeight > 0) - { - throw new ArgumentException("`layoutHeight` and `layoutHeightChars` cannot both be set."); - } - } - else + x0 = minX0, + x1 = maxX1, + top = top, + bottom = top, + width = maxX1 - minX0, + height = 0, + orientation = "h", + object_type = "text_edge" + }); + + // Bottom edge + edges.Add(new Edge { - layoutHeightChars = (int)Math.Round(layoutHeight / yDensity); - } + x0 = minX0, + x1 = maxX1, + top = bottom, + bottom = bottom, + width = maxX1 - minX0, + height = 0, + orientation = "h", + object_type = "text_edge" + }); } - int numNewlines = 0; - var wordsSortedDoctop = presorted || useTextFlow - ? 
Tuples - : Tuples.OrderBy(t => t.Item1.doctop).ToList(); + return edges; + } - Character firstWord = wordsSortedDoctop[0].Item1; - float doctopStart = firstWord.doctop - firstWord.top; + // get_bbox_overlap - Get overlap between two bboxes + internal static Tuple GetBboxOverlap( + Tuple a, + Tuple b) + { + float oLeft = Math.Max(a.Item1, b.Item1); + float oRight = Math.Min(a.Item3, b.Item3); + float oBottom = Math.Min(a.Item4, b.Item4); + float oTop = Math.Max(a.Item2, b.Item2); + float oWidth = oRight - oLeft; + float oHeight = oBottom - oTop; + + if (oHeight >= 0 && oWidth >= 0 && oHeight + oWidth > 0) + return Tuple.Create(oLeft, oTop, oRight, oBottom); + + return null; + } + + // words_to_edges_v - Find vertical edges from words + internal static List WordsToEdgesV( + List> words, + int wordThreshold = (int)TableFlags.TABLE_DEFAULT_MIN_WORDS_VERTICAL) + { + var byX0 = TableHelpers.ClusterObjects(words, w => Convert.ToSingle(w["x0"]), 1); + var byX1 = TableHelpers.ClusterObjects(words, w => Convert.ToSingle(w["x1"]), 1); + + Func, float> getCenter = w => + (Convert.ToSingle(w["x0"]) + Convert.ToSingle(w["x1"])) / 2; + var byCenter = TableHelpers.ClusterObjects(words, getCenter, 1); + + var clusters = byX0.Concat(byX1).Concat(byCenter).ToList(); + var sortedClusters = clusters.OrderByDescending(x => x.Count).ToList(); + var largeClusters = sortedClusters.Where(x => x.Count >= wordThreshold).ToList(); - int k = 0; - foreach (var ws in cluster_objects(wordsSortedDoctop, t => t.Item1.doctop, yTolerance)) + if (largeClusters.Count == 0) + return new List(); + + var bboxes = largeClusters.Select(cluster => { - float yDist = layout - ? (ws[0].Item1.doctop - (doctopStart + yShift)) / yDensity - : 0; + var rect = ObjectsToRect(cluster.Cast()); + return Tuple.Create( + Convert.ToSingle(rect["x0"]), + Convert.ToSingle(rect["top"]), + Convert.ToSingle(rect["x1"]), + Convert.ToSingle(rect["bottom"]) + ); + }).ToList(); - int numNewlinesPrepend = Math.Max(k > 0 ? 1 : 0, (int)Math.Round(yDist) - numNewlines); - k++; - for (int i = 0; i < numNewlinesPrepend; i++) - { - if (textMap.Count == 0 || textMap.Last().Text == "\n") - { - textMap.Add(new TextItem(" ", null)); // Blank line handling - } - textMap.Add(new TextItem("\n", null)); // Add newline - } - numNewlines += numNewlinesPrepend; + var condensedBboxes = new List>(); + foreach (var bbox in bboxes) + { + bool hasOverlap = condensedBboxes.Any(c => GetBboxOverlap(bbox, c) != null); + if (!hasOverlap) + condensedBboxes.Add(bbox); + } - float lineLen = 0; + if (condensedBboxes.Count == 0) + return new List(); - var lineWordsSortedX0 = presorted || useTextFlow - ? ws - : ws.OrderBy(w => w.Item1.x0).ToList(); + var condensedRects = condensedBboxes.Select(bbox => BboxToRect(bbox)) + .OrderBy(r => Convert.ToSingle(r["x0"])).ToList(); + + float maxX1 = condensedRects.Max(r => Convert.ToSingle(r["x1"])); + float minTop = condensedRects.Min(r => Convert.ToSingle(r["top"])); + float maxBottom = condensedRects.Max(r => Convert.ToSingle(r["bottom"])); - foreach (var word in lineWordsSortedX0) + var edges = new List(); + foreach (var r in condensedRects) + { + edges.Add(new Edge { - var wordObj = word.Item1; - float xDist = layout ? 
(wordObj.x0 - xShift) / xDensity : 0; - int numSpacesPrepend = Math.Max(Math.Min(1, (int)lineLen), (int)Math.Round(xDist) - (int)lineLen); - for (int i = 0; i < numSpacesPrepend; i++) - { - textMap.Add(new TextItem(" ", null)); // Add spaces before the word - } - lineLen += numSpacesPrepend; + x0 = Convert.ToSingle(r["x0"]), + x1 = Convert.ToSingle(r["x0"]), + top = minTop, + bottom = maxBottom, + width = 0, + height = maxBottom - minTop, + orientation = "v", + object_type = "text_edge" + }); + } - foreach (Character c in word.Item2) - { - string letters = expansions.ContainsKey(c.text) ? expansions[c.text] : c.text; - foreach (var letter in letters) - { - textMap.Add(new TextItem(letter.ToString(), c)); // Add each letter - lineLen += 1; - } - } - } + // Add rightmost edge + edges.Add(new Edge + { + x0 = maxX1, + x1 = maxX1, + top = minTop, + bottom = maxBottom, + width = 0, + height = maxBottom - minTop, + orientation = "v", + object_type = "text_edge" + }); - // Add spaces at the end of the line if layout - if (layout) + return edges; + } + + // edges_to_intersections - Find intersection points of edges + internal static Dictionary, Dictionary>> EdgesToIntersections( + List edges, + float xTolerance = 1, + float yTolerance = 1) + { + var intersections = new Dictionary, Dictionary>>(); + var vEdges = edges.Where(e => e.orientation == "v") + .OrderBy(e => e.x0).ThenBy(e => e.top).ToList(); + var hEdges = edges.Where(e => e.orientation == "h") + .OrderBy(e => e.top).ThenBy(e => e.x0).ToList(); + + foreach (var v in vEdges) + { + foreach (var h in hEdges) { - for (int i = 0; i < (layoutWidthChars - (int)lineLen); i++) + if ((v.top <= (h.top + yTolerance)) && + (v.bottom >= (h.top - yTolerance)) && + (v.x0 >= (h.x0 - xTolerance)) && + (v.x0 <= (h.x1 + xTolerance))) { - textMap.Add(new TextItem(" ", null)); + var vertex = Tuple.Create(v.x0, h.top); + if (!intersections.ContainsKey(vertex)) + { + intersections[vertex] = new Dictionary> + { + { "v", new List() }, + { "h", new List() } + }; + } + intersections[vertex]["v"].Add(v); + intersections[vertex]["h"].Add(h); } } } - // Append blank lines at the end of text - if (layout) + return intersections; + } + + // intersections_to_cells - Convert intersections to cells + internal static List IntersectionsToCells_( + Dictionary, Dictionary>> intersections) + { + var cells = new List(); + var points = intersections.Keys.OrderBy(p => p.Item2).ThenBy(p => p.Item1).ToList(); + int nPoints = points.Count; + + Func, Tuple, bool> edgeConnects = (p1, p2) => { - int numNewlinesAppend = layoutHeightChars - (numNewlines + 1); - for (int i = 0; i < numNewlinesAppend; i++) + Func, HashSet>> edgesToSet = edges => + { + return new HashSet>(edges.Select(e => + Tuple.Create(e.x0, e.top, e.x1, e.bottom))); + }; + + if (p1.Item1 == p2.Item1) // Same x { - if (i > 0) - { - textMap.Add(new TextItem(" ", null)); // Blank line at the end - } - textMap.Add(new TextItem("\n", null)); // Add newline + var common = new HashSet>(edgesToSet(intersections[p1]["v"])); + common.IntersectWith(edgesToSet(intersections[p2]["v"])); + if (common.Count > 0) + return true; } - // Remove the last newline if present - if (textMap.Last().Text == "\n") + if (p1.Item2 == p2.Item2) // Same y { - textMap.RemoveAt(textMap.Count - 1); + var common = new HashSet>(edgesToSet(intersections[p1]["h"])); + common.IntersectWith(edgesToSet(intersections[p2]["h"])); + if (common.Count > 0) + return true; } - } - - return new TextMap(textMap); - } - } - public class WordExtractor - { - public float 
xTolerance; - public float yTolerance; - public bool keepBlankChars; - public bool useTextFlow; - public bool horizontalLtr; // Should words be read left-to-right? - public bool verticalTtb; // Should vertical words be read top-to-bottom? - public List extraAttrs; - public string splitAtPunctuation; - public Dictionary expansions; + return false; + }; - public WordExtractor( - float xTolerance = TableFlags.TABLE_DEFAULT_X_TOLERANCE, - float yTolerance = TableFlags.TABLE_DEFAULT_Y_TOLERANCE, - bool keepBlankChars = false, - bool useTextFlow = false, - bool horizontalLtr = true, - bool verticalTtb = false, - List extraAttrs = null, - bool splitAtPunctuation = false, - bool expandLigatures = true - ) - { - this.xTolerance = xTolerance; - this.yTolerance = yTolerance; - this.keepBlankChars = keepBlankChars; - this.useTextFlow = useTextFlow; - this.horizontalLtr = horizontalLtr; - this.verticalTtb = verticalTtb; - this.extraAttrs = extraAttrs ?? new List(); - this.splitAtPunctuation = splitAtPunctuation ? string.Join("", new[] { '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~' }) : ""; - this.expansions = expandLigatures ? TableFlags.TABLE_LIGATURES : new Dictionary(); - } + for (int i = 0; i < nPoints - 1; i++) + { + var pt = points[i]; + var rest = points.Skip(i + 1).ToList(); - public Character merge_chars(List orderedChars) - { - float x0, top, x1, bottom; - BBox bbox = objects_to_bbox(orderedChars); - x0 = bbox.x0; top = bbox.top; x1 = bbox.x1; bottom = bbox.bottom; - float doctopAdj = orderedChars[0].doctop - orderedChars[0].top; - bool upright = orderedChars[0].upright; - int direction = (this.horizontalLtr ? 1 : -1) * (upright ? 1 : -1); + var below = rest.Where(x => x.Item1 == pt.Item1).ToList(); + var right = rest.Where(x => x.Item2 == pt.Item2).ToList(); - Matrix matrix = orderedChars[0].matrix; + foreach (var belowPt in below) + { + if (!edgeConnects(pt, belowPt)) + continue; - int rotation = 0; - if (!upright && matrix[1] < 0) - { - orderedChars.Reverse(); - rotation = 270; - } - else if (matrix[0] < 0 && matrix[3] < 0) - { - rotation = 180; - } - else if (matrix[1] > 0) - { - rotation = 90; - } + foreach (var rightPt in right) + { + if (!edgeConnects(pt, rightPt)) + continue; - var word = new Character { - text = string.Join("", orderedChars.Select(c => expansions.ContainsKey(c.text) ? 
expansions[c.text] : c.text)), - x0 = x0, - x1 = x1, - top = top, - doctop = top + doctopAdj, - bottom = bottom, - upright = upright, - direction = direction, - rotation = rotation - }; + var bottomRight = Tuple.Create(rightPt.Item1, belowPt.Item2); - foreach (var key in this.extraAttrs) - { - var val = orderedChars[0].GetType().GetProperty(key).GetValue(orderedChars[0]); - word.GetType().GetProperty(key).SetValue(word, val); + if (intersections.ContainsKey(bottomRight) && + edgeConnects(bottomRight, rightPt) && + edgeConnects(bottomRight, belowPt)) + { + cells.Add(new Rect(pt.Item1, pt.Item2, bottomRight.Item1, bottomRight.Item2)); + } + } + } } - return word; + return cells; } - // This method takes several factors into account to determine if - // `curr_char` represents the beginning of a new word: - // - Whether the text is "upright" (i.e., non-rotated) - // - Whether the user has specified that horizontal text runs - // left-to-right(default) or right-to-left, as represented by - // self.horizontal_ltr - // - Whether the user has specified that vertical text the text runs - // top-to-bottom(default) or bottom-to-top, as represented by - // self.vertical_ttb - // - The x0, top, x1, and bottom attributes of prev_char and - // curr_char - // - The self.x_tolerance and self.y_tolerance settings. Note: In - // this case, x/y refer to those directions for non-rotated text. - // For vertical text, they are flipped.A more accurate terminology - // might be "*intra*line character distance tolerance" and - // "*inter*line character distance tolerance" - // An important note: The* intra*line distance is measured from the - // * end* of the previous character to the *beginning* of the current - // character, while the* inter*line distance is measured from the - // * top* of the previous character to the *top* of the next - // character.The reasons for this are partly repository-historical, - // and partly logical, as successive text lines' bounding boxes often - // overlap slightly (and we don't want that overlap to be interpreted - // as the two lines being the same line). - // The upright-ness of the character determines the attributes to - // compare, while horizontal_ltr/vertical_ttb determine the direction - // of the comparison. - public bool char_begins_new_word(Character prevChar, Character currChar) - { - float x, y, ay, cy, ax, bx, cx; - - // Note: Due to the grouping step earlier in the process, - // curr_char["upright"] will always equal prev_char["upright"]. 
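// Illustrative note (not part of the port itself): for upright, left-to-right
// text with xTolerance = 3 and yTolerance = 3, the test below reduces to
// checking curr_char.x0 against [prev_char.x0, prev_char.x1 + 3] and
// curr_char.top against prev_char.top + 3. With assumed example values
//   prev_char: x0=10, x1=18, top=100   curr_char: x0=20, top=100
// none of the three conditions fires, so the character extends the current
// word; with curr_char.x0 = 25 the middle condition (cx > bx + x) fires and
// a new word begins.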
- if (currChar.upright == true) + internal static List IntersectionsToCells( + Dictionary, Dictionary>> intersections) + { + // ---------- edge_connects ---------- + bool EdgeConnects( + Tuple p1, + Tuple p2) { - x = this.xTolerance; - y = this.yTolerance; - ay = prevChar.top; - cy = currChar.top; - if (horizontalLtr == true) - { - ax = prevChar.x0; - bx = prevChar.x1; - cx = currChar.x0; - } - else + HashSet<(float, float, float, float)> EdgesToSet(List edges) { - ax = -prevChar.x1; - bx = -prevChar.x0; - cx = -currChar.x1; + var set = new HashSet<(float, float, float, float)>(); + foreach (var e in edges) + set.Add(ObjToBBox(e)); + return set; } - } - else - { - x = this.yTolerance; - y = this.xTolerance; - ay = prevChar.x0; - cy = currChar.x0; - if (verticalTtb == true) + + // Same X → vertical edges + if (p1.Item1 == p2.Item1) { - ax = prevChar.top; - bx = prevChar.bottom; - cx = currChar.top; + var common = EdgesToSet(intersections[p1]["v"]) + .Intersect(EdgesToSet(intersections[p2]["v"])); + + if (common.Any()) + return true; } - else + + // Same Y → horizontal edges + if (p1.Item2 == p2.Item2) { - ax = -prevChar.bottom; - bx = -prevChar.top; - cx = -currChar.bottom; + var common = EdgesToSet(intersections[p1]["h"]) + .Intersect(EdgesToSet(intersections[p2]["h"])); + + if (common.Any()) + return true; } + + return false; } - return (cx < ax) || (cx > bx + x) || (cy > ay + y); - } + var points = intersections.Keys + .OrderBy(p => p.Item1) + .ThenBy(p => p.Item2) + .ToList(); - public IEnumerable> iter_chars_to_words(List orderedChars) - { - List currentWord = new List(); + int nPoints = points.Count; - foreach (var charDict in orderedChars) + // ---------- find_smallest_cell ---------- + Rect FindSmallestCell(int i) { - string text = charDict.text; + if (i == nPoints - 1) + return null; - // If keep_blank_chars is false and the char is a space, we start the next word - if (!this.keepBlankChars && string.IsNullOrWhiteSpace(text)) - { - yield return currentWord; // Yield the current word - currentWord.Clear(); - } + var pt = points[i]; + var rest = points.Skip(i + 1); - // If text is a punctuation mark, split the word - else if (this.splitAtPunctuation.Contains(text)) - { - yield return currentWord; // Yield the current word - currentWord.Clear(); - currentWord.Add(charDict); // Add punctuation as a new word - yield return currentWord; // Yield the punctuation as a word - currentWord.Clear(); - } - // Check if this character begins a new word - else if (currentWord.Count > 0 && char_begins_new_word(currentWord[currentWord.Count - 1], charDict)) - { - yield return currentWord; // Yield the current word - currentWord.Clear(); - currentWord.Add(charDict); // Start a new word with this char - } - else + var below = rest.Where(p => p.Item1 == pt.Item1).ToList(); + var right = rest.Where(p => p.Item2 == pt.Item2).ToList(); + + foreach (var belowPt in below) { - currentWord.Add(charDict); // Otherwise, just add the character to the current word + if (!EdgeConnects(pt, belowPt)) + continue; + + foreach (var rightPt in right) + { + if (!EdgeConnects(pt, rightPt)) + continue; + + var bottomRight = Tuple.Create(rightPt.Item1, belowPt.Item2); + + if (intersections.ContainsKey(bottomRight) && + EdgeConnects(bottomRight, rightPt) && + EdgeConnects(bottomRight, belowPt)) + { + float x0 = pt.Item1; + float y0 = pt.Item2; + float x1 = bottomRight.Item1; + float y1 = bottomRight.Item2; + + return new Rect( + x0, + y0, + x1, + y1 + ); + } + } } + + return null; } - // Yield the last word if it exists - if 
(currentWord.Count > 0) + // ---------- generate cells ---------- + var cells = new List(); + + for (int i = 0; i < points.Count; i++) { - yield return currentWord; + var cell = FindSmallestCell(i); + if (cell != null) + cells.Add(cell); } + + return cells; } - public IEnumerable iter_sort_chars(List chars) + // ---------- obj_to_bbox ---------- + private static (float, float, float, float) ObjToBBox(Edge e) { - Func upright_key = x => -Convert.ToInt32(x.upright); - - // Sort characters based on "upright" - var uprightClusters = chars - .GroupBy(x => x.upright) - .OrderByDescending(g => g.Key) // Group by "upright" key (1 for upright, 0 for non-upright) - .ToList(); + return (e.x0, e.top, e.x1, e.bottom); + } - foreach (var uprightCluster in uprightClusters) + // cells_to_tables - Group cells into tables + internal static List> CellsToTables(Page page, List cells) + { + Func>> bboxToCorners = bbox => { - bool upright = uprightCluster.Key; - string clusterKey = upright ? "doctop" : "x0"; // Define clustering key based on upright status + return new List> + { + Tuple.Create(bbox.X0, bbox.Y0), + Tuple.Create(bbox.X0, bbox.Y1), + Tuple.Create(bbox.X1, bbox.Y0), + Tuple.Create(bbox.X1, bbox.Y1) + }; + }; + + var remainingCells = new List(cells); + var currentCorners = new HashSet>(); + var currentCells = new List(); + var tables = new List>(); - // Cluster by line using "doctop" for upright or "x0" for non-upright characters - var subclusters = uprightCluster - .GroupBy(c => upright ? c.doctop : c.x0) - .OrderBy(g => g.Key) - .ToList(); + while (remainingCells.Count > 0) + { + int initialCellCount = currentCells.Count; + var cellsToRemove = new List(); - foreach (var subcluster in subclusters) + foreach (var cell in remainingCells) { - // Sort within each subcluster - var sortedChars = subcluster.OrderBy(c => upright ? c.x0 : c.doctop).ToList(); + var cellCorners = bboxToCorners(cell); - // Reverse order if necessary - if (!(horizontalLtr && upright || verticalTtb && !upright)) + if (currentCells.Count == 0) { - sortedChars.Reverse(); + foreach (var corner in cellCorners) + currentCorners.Add(corner); + currentCells.Add(cell); + cellsToRemove.Add(cell); } - - // Yield the sorted characters - foreach (var character in sortedChars) + else { - yield return character; + int cornerCount = cellCorners.Count(c => currentCorners.Contains(c)); + if (cornerCount > 0) + { + foreach (var corner in cellCorners) + currentCorners.Add(corner); + currentCells.Add(cell); + cellsToRemove.Add(cell); + } } } - } - } - public IEnumerable>> iter_extract_tuples(List chars) - { - // Sort characters if necessary - var orderedChars = useTextFlow ? 
chars : iter_sort_chars(chars).ToList(); + foreach (var cell in cellsToRemove) + remainingCells.Remove(cell); - // Group characters by "Upright" and any extra attributes - var groupedChars = orderedChars - .GroupBy(c => new { c.upright, ExtraAttrs = string.Join(",", extraAttrs.Select(attr => attr)) }) - .ToList(); + if (currentCells.Count == initialCellCount) + { + tables.Add(new List(currentCells)); + currentCorners.Clear(); + currentCells.Clear(); + } + } - foreach (var group in groupedChars) + if (currentCells.Count > 0) + tables.Add(currentCells); + + // MuPDF modification: Remove tables without text or having only 1 column + for (int i = tables.Count - 1; i >= 0; i--) { - var charGroup = group.ToList(); // All characters in this group + var table = tables[i]; + var r = new Rect(0, 0, 0, 0); + var x1Vals = new HashSet(); + var x0Vals = new HashSet(); + + foreach (var c in table) + { + r = r | c; + x1Vals.Add(c.X1); + x0Vals.Add(c.X0); + } + + if (x1Vals.Count < 2 || x0Vals.Count < 2) + { + tables.RemoveAt(i); + continue; + } - // Assuming we have a method to split characters into words - foreach (var wordChars in iter_chars_to_words(charGroup)) + // Check if table has only whitespace + try { - // Yield the word (merged characters and the list of characters) - if (wordChars.Count > 0) + var textpage = TableGlobals.TEXTPAGE ?? page.GetTextPage(); + string text = textpage.ExtractTextBox(r.ToFzRect()); + if (string.IsNullOrWhiteSpace(text)) { - yield return new Tuple>(merge_chars(wordChars), wordChars); + tables.RemoveAt(i); + continue; } } + catch + { + // If text extraction fails, keep the table + } } - } - - public WordMap extract_wordmap(List chars) - { - // Convert the result of IterExtractTuples into a list of tuples and return a WordMap - return new WordMap(iter_extract_tuples(chars).ToList()); - } - public List extract_words(List chars) - { - // Extract words by iterating over the tuples and selecting the first item (the word) - var words = iter_extract_tuples(chars) - .Select(tuple => tuple.Item1) // Select the word (first item in the tuple) - .ToList(); + // Sort tables top-to-bottom-left-to-right + tables = tables.OrderBy(t => t.Min(c => Tuple.Create(c.Y0, c.X0))).ToList(); - return words; + return tables; } } - + + // CellGroup base class public class CellGroup { - public List Cells { get; set; } // List of tuples representing the bounding boxes - - public BBox Bbox { get; set; } + public List cells { get; set; } + public Rect bbox { get; set; } - public CellGroup(List cells) + public CellGroup(List cells) { - Cells = cells; - - // Filter out null cells and then calculate the bounding box (bbox) - var filteredCells = cells.Where(cell => cell != null).ToList(); - - // Calculate the bounding box using LINQ (equivalent to min/max in Python) - Bbox = new BBox( - filteredCells.Min(cell => cell.x0), // min x0 - filteredCells.Min(cell => cell.top), // min top - filteredCells.Max(cell => cell.x1), // max x1 - filteredCells.Max(cell => cell.bottom) // max bottom - ); + this.cells = cells; + if (cells != null && cells.Count > 0) + { + var validCells = cells.Where(c => c != null).ToList(); + if (validCells.Count > 0) + { + bbox = new Rect( + validCells.Min(c => c.X0), + validCells.Min(c => c.Y0), + validCells.Max(c => c.X1), + validCells.Max(c => c.Y1) + ); + } + } } } + // TableRow class public class TableRow : CellGroup { - // Inherits everything from CellGroup and does not add any new behavior yet. 
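// Worked example (illustrative values only): a TableRow built from the cells
//   [ Rect(10, 20, 50, 30), null, Rect(90, 20, 130, 30) ]
// keeps the null placeholder for the missing column, while the bbox inherited
// from CellGroup is computed over the non-null cells only, giving
//   Rect(10, 20, 130, 30).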
- public TableRow(List cells) : base(cells) + public TableRow(List cells) : base(cells) { } } + // TableHeader class public class TableHeader { - // Properties to hold the bounding box, cells, names, and above (external) - public BBox Bbox { get; set; } - public List Cells { get; set; } - public List Names { get; set; } - public bool External { get; set; } // Use 'object' if 'above' can be of different types + public Rect bbox { get; set; } + public List cells { get; set; } + public List names { get; set; } + public bool external { get; set; } - // Constructor - public TableHeader(BBox bbox, List cells, List names, bool above) + public TableHeader(Rect bbox, List cells, List names, bool external) { - Bbox = bbox; - Cells = cells; - Names = names; - External = above; + this.bbox = bbox; + this.cells = cells; + this.names = names; + this.external = external; } } + // Table class public class Table { - public Page Page { get; set; } // Represents the page object in your document - public List Cells { get; set; } - public TableHeader Header { get; set; } - public List Chars { get; set; } + public Page page { get; set; } + public TextPage textpage { get; set; } + public List cells { get; set; } + public TableHeader header { get; set; } - public Table(Page page, List cells, List chars) + public Table(Page page, List cells) { - this.Page = page; - this.Cells = cells; - this.Chars = chars; - this.Header = _get_header(); + this.page = page; + this.cells = cells; + this.textpage = null; + this.header = GetHeader(); } - public BBox Bbox + public Rect bbox { get { - var c = this.Cells; - return new BBox( - c.Min(cell => cell.x0), - c.Min(cell => cell.top), - c.Max(cell => cell.x1), - c.Max(cell => cell.bottom) + if (cells == null || cells.Count == 0) + return null; + return new Rect( + cells.Min(c => c.X0), + cells.Min(c => c.Y0), + cells.Max(c => c.X1), + cells.Max(c => c.Y1) ); } } - public List Rows + public List rows { get { - var sorted = this.Cells.OrderBy(cell => cell.top).ThenBy(cell => cell.x0).ToList(); - var xCoordinates = sorted.Select(cell => cell.x0).Distinct().OrderBy(x => x).ToList(); + var sorted = cells.OrderBy(c => c.Y0).ThenBy(c => c.X0).ToList(); + var xs = cells.Select(c => c.X0).Distinct().OrderBy(x => x).ToList(); var rows = new List(); - foreach (var group in sorted.GroupBy(cell => cell.top)) + foreach (var group in sorted.GroupBy(c => c.Y0)) { - var rowCells = group.ToDictionary(cell => cell.x0, cell => cell); - var row = new TableRow(rowCells.Values.ToList()); + var rowCells = group.OrderBy(c => c.X0).ToList(); + var xdict = rowCells.ToDictionary(c => c.X0, c => c); + var row = new TableRow(xs.Select(x => xdict.ContainsKey(x) ? xdict[x] : null).ToList()); rows.Add(row); } @@ -2009,416 +2298,448 @@ public List Rows } } - public int RowCount => Rows.Count; - public int ColCount => Rows.Max(row => row.Cells.Count); + public int row_count + { + get { return rows.Count; } + } + + public int col_count + { + get { return rows.Count > 0 ? 
rows.Max(r => r.cells.Count) : 0; } + } public List> Extract(Dictionary kwargs = null) { - var chars = Chars; // Placeholder for actual char extraction logic + if (kwargs == null) + kwargs = new Dictionary(); + + var chars = TableGlobals.CHARS; var tableArr = new List>(); - bool char_in_bbox(Character character, BBox bbox) + bool CharInBbox(CharDict char_, Rect bbox) { - // Calculate the vertical and horizontal midpoints of the character's bounding box - float vMid = (character.top + character.bottom) / 2; - float hMid = (character.x0 + character.x1) / 2; - - // Get the coordinates from the bounding box - float x0 = bbox.x0; - float top = bbox.top; - float x1 = bbox.x1; - float bottom = bbox.bottom; - - // Check if the character's midpoint is within the bounding box - return (hMid >= x0 && hMid < x1 && vMid >= top && vMid < bottom); + float v_mid = (char_.top + char_.bottom) / 2; + float h_mid = (char_.x0 + char_.x1) / 2; + return h_mid >= bbox.X0 && h_mid < bbox.X1 && v_mid >= bbox.Y0 && v_mid < bbox.Y1; } - foreach (var row in Rows) + foreach (var row in rows) { - var rowArr = new List(); - var rowChars = chars.Where(c => char_in_bbox(c, row.Bbox)).ToList(); + var arr = new List(); + var rowChars = chars.Where(c => CharInBbox(c, row.bbox)).ToList(); - foreach (BBox cell in row.Cells) + foreach (var cell in row.cells) { - string cellText = string.Empty; - if (cell != null) + if (cell == null) + { + arr.Add(null); + } + else { - var cellChars = rowChars.Where(c => char_in_bbox(c, cell)).ToList(); - if (cellChars.Any()) + var cellChars = rowChars.Where(c => CharInBbox(c, cell)).ToList(); + if (cellChars.Count > 0) { - if (kwargs == null) - { - kwargs = new Dictionary(); - } - kwargs["x_shift"] = cell.x0; - kwargs["y_shift"] = cell.top; - - // Check if "layout" is in kwargs and update layout_width and layout_height accordingly - if (kwargs.ContainsKey("layout")) + var cellKwargs = new Dictionary(kwargs); + cellKwargs["x_shift"] = cell.X0; + cellKwargs["y_shift"] = cell.Y0; + if (cellKwargs.ContainsKey("layout")) { - kwargs["layout_width"] = cell.x1 - cell.x0; - kwargs["layout_height"] = cell.bottom - cell.top; + cellKwargs["layout_width"] = cell.X1 - cell.X0; + cellKwargs["layout_height"] = cell.Y1 - cell.Y0; } - // Call your text extraction logic here - cellText = extract_text(cellChars, kwargs); + var cellText = ExtractText(cellChars, cellKwargs); + arr.Add(cellText); } else { - cellText = string.Empty; + arr.Add(""); } } - rowArr.Add(cellText); } - tableArr.Add(rowArr); + tableArr.Add(arr); } return tableArr; } - // Output table content as a string in Github-markdown format. - // If clean is true, markdown syntax is removed from cell content. 
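// Shape of the intended GitHub-markdown output (content values are made up):
//   |Country|Capital|
//   |---|---|
//   |France|Paris|
//   |Japan|Tokyo|
// Empty header names are replaced by "Col1", "Col2", ...; with clean == true,
// characters that carry Markdown meaning in header or cell text are escaped.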
- public string ToMarkdown(bool clean = true) + private string ExtractText(List chars, Dictionary kwargs) + { + return TextExtractionHelpers.ExtractText(chars, kwargs); + } + + public string ToMarkdown(bool clean = false, bool fillEmpty = true) { - StringBuilder output = new StringBuilder("|"); + var output = new StringBuilder(); + output.Append("|"); // Start with "|" as in Python line 1604 + int rows = row_count; + int cols = col_count; - // Generate header string and MD underline - for (int i = 0; i < Header.Names.Count; i++) + // cell coordinates + var cellBoxes = this.rows.Select(r => r.cells.ToList()).ToList(); + + // cell text strings + var cells = new List>(); + for (int i = 0; i < rows; i++) { - string name = Header.Names[i]; - if (string.IsNullOrEmpty(name)) // Generate a name if empty + cells.Add(new List()); + for (int colIdx = 0; colIdx < cols; colIdx++) { - name = $"Col{i + 1}"; + cells[i].Add(null); } + } - name = name.Replace("\n", " "); // Remove any line breaks + for (int i = 0; i < cellBoxes.Count; i++) + { + for (int colIdx = 0; colIdx < cellBoxes[i].Count && colIdx < cols; colIdx++) + { + if (cellBoxes[i][colIdx] != null) + { + cells[i][colIdx] = TableHelpers.ExtractCells(textpage, cellBoxes[i][colIdx], markdown: true); + } + } + } - if (clean) // Remove sensitive syntax + if (fillEmpty) + { + // for rows, copy content from left to right + for (int rowIdx = 0; rowIdx < rows; rowIdx++) { - name = WebUtility.HtmlEncode(name.Replace("-", "-")); + for (int i = 0; i < cols - 1; i++) + { + if (cells[rowIdx][i + 1] == null) + { + cells[rowIdx][i + 1] = cells[rowIdx][i]; + } + } } - output.Append(name + "|"); + // for columns, copy top to bottom + for (int i = 0; i < cols; i++) + { + for (int rowIdx = 0; rowIdx < rows - 1; rowIdx++) + { + if (cells[rowIdx + 1][i] == null) + { + cells[rowIdx + 1][i] = cells[rowIdx][i]; + } + } + } } - output.Append("\n"); - - // Generate the markdown header line - for (int i = 0; i < ColCount; i++) + // generate header string and MD separator + // Note: Python assumes self.header always exists, so we do the same + for (int i = 0; i < header.names.Count; i++) { - output.Append("---|"); + string name = header.names[i]; + if (string.IsNullOrEmpty(name)) + { + name = $"Col{i + 1}"; + } + name = name.Replace("\n", "
"); + if (clean) + { + name = System.Security.SecurityElement.Escape(name.Replace("-", "-")); + } + output.Append(name + "|"); } output.Append("\n"); + // insert GitHub header line separator + output.Append("|" + string.Join("|", Enumerable.Range(0, col_count).Select(_ => "---")) + "|\n"); - // Skip first row in details if header is part of the table - int j = (Header.External ? 0 : 1); + // skip first row in details if header is part of the table + int startRow = header.external ? 0 : 1; - // Iterate over detail rows - var rows = Extract(); // Assuming Extract() is a method that returns a List> - foreach (var row in rows.GetRange(j, rows.Count - j)) + // iterate over detail rows + for (int i = startRow; i < rows; i++) { - string line = "|"; - foreach (var cell in row) + output.Append("|"); + for (int k = 0; k < cols; k++) { - // Output null cells with empty string - string cellContent = cell ?? ""; - cellContent = cellContent.Replace("\n", " "); // Remove line breaks - if (clean) // Remove sensitive syntax + string cell = cells[i][k]; + if (cell == null) + cell = ""; + if (clean) { - cellContent = WebUtility.HtmlEncode(cellContent.Replace("-", "-")); + cell = System.Security.SecurityElement.Escape(cell.Replace("-", "-")); } - line += cellContent + "|"; + output.Append(cell + "|"); } - line += "\n"; - output.Append(line); + output.Append("\n"); } - return output.ToString() + "\n"; } - - // Identify the table header. - // *** PyMuPDF extension. *** - // Starting from the first line above the table upwards, check if it - // qualifies to be part of the table header. - // Criteria include: - // * A one-line table never has an extra header. - // * Column borders must not intersect any word. If this happens, all - // text of this line and above of it is ignored. - // * No excess inter-line distance: If a line further up has a distance - // of more than 1.5 times of its font size, it will be ignored and - // all lines above of it. - // * Must have same text properties. - // * Starting with the top table line, a bold text property cannot change - // back to non-bold. - // If not all criteria are met (or there is no text above the table), - // the first table row is assumed to be the header. - private TableHeader _get_header(int yTolerance = 3) - { - // Check if row 0 has bold text anywhere. - // If this is true, then any non - bold text in lines above disqualify - // these lines as header. - // bbox is the(potentially repaired) row 0 bbox. - // Returns True or False - bool top_row_is_bold(BBox _bbox) - { - List blocks = Page.GetText("dict", clip: new Rect(_bbox.x0, _bbox.top, _bbox.x1, _bbox.bottom), - flags: (int)TextFlagsExtension.TEXTFLAGS_TEXT).Blocks; - foreach (Block block in blocks) - { - foreach (Line line in block.Lines) - { - foreach (Span span in line.Spans) - { - if (((int)span.Flags & 16) != 0) - { - return true; - } - } - } + + // to_pandas - Return a pandas DataFrame version of the table + // Note: This would require the pandas.NET library or similar + // For C#, users can convert the Extract() result to their preferred data structure + public object ToPandas(Dictionary kwargs = null) + { + // In Python: returns pandas.DataFrame + // In C#: Could return DataTable, or users can use Extract() and convert manually + throw new NotImplementedException("ToPandas is not implemented in C#. 
Use Extract() and convert to your preferred data structure (e.g., DataTable)."); + } + + private string ExtractCells(TextPage textpage, Rect cell, bool markdown = false) + { + return TableHelpers.ExtractCells(textpage, cell, markdown); + } + + private TableHeader GetHeader(float yTolerance = 3.0f) + { + float yDelta = yTolerance; + + // Helper function: Check if top row has different background color + bool TopRowBgColor() + { + try + { + var bbox0 = rows[0].bbox; + var bboxt = new Rect(bbox0.X0, bbox0.Y0 - bbox0.Height, bbox0.X1, bbox0.Y0); + var (_, topColor0) = page.GetPixmap(clip: bbox0).ColorTopUsage(); + var (_, topColort) = page.GetPixmap(clip: bboxt).ColorTopUsage(); + return !topColor0.SequenceEqual(topColort); + } + catch + { + return false; } - return false; } - if (Rows.Count == 0) + // Helper function: Check if row contains bold text + bool RowHasBold(Rect rowBbox) { - return null; + return TableGlobals.CHARS.Any(c => + TableHelpers.RectInRect(new Rect(c.x0, c.y0, c.x1, c.y1), rowBbox) && c.bold); } - var row = Rows[0]; - var cells = row.Cells; - var bbox = new BBox(row.Bbox.x0, row.Bbox.top, row.Bbox.x1, row.Bbox.bottom); + if (rows == null || rows.Count == 0) + return null; - TableHeader headerTopRow = new TableHeader(bbox, cells, Extract()[0], false); + var row = rows[0]; + var cells = row.cells; + var bbox = row.bbox; + + // Return this if we determine that the top row is the header + var extractResult = Extract(); + var headerTopRow = new TableHeader( + bbox, + cells, + extractResult.Count > 0 ? extractResult[0] : new List(), + false + ); - // One-line tables have no extra header - if (Rows.Count < 2) + // 1-line tables have no extra header + if (rows.Count < 2) return headerTopRow; + // 1-column tables have no extra header if (cells.Count < 2) return headerTopRow; - // column (x) coordinates - var colX = new List(); - foreach (var cell in cells.Take(cells.Count - 1)) - { - if (cell != null) - { - colX.Add(cell.x1); // Assuming X1 is the right edge of the cell - } - } + // Assume top row is the header if second row is empty + var row2 = rows[1]; + if (row2.cells.All(c => c == null)) + return headerTopRow; // Special check: is top row bold? - // If first line above table is not bold, but top-left table cell is bold, - // we take first table row as header - bool topRowBold = top_row_is_bold(bbox); + bool topRowBold = RowHasBold(bbox); + + // Assume top row is header if it is bold and any cell of 2nd row is non-bold + if (topRowBold && !RowHasBold(row2.bbox)) + return headerTopRow; + + if (TopRowBgColor()) + return headerTopRow; + + // Column coordinates (x1 values) in top row + var colX = cells.Take(cells.Count - 1).Select(c => c != null ? c.X1 : (float?)null).ToList(); - // clip = area above table - // We will inspect this area for text qualifying as column header. - BBox clip = new BBox(bbox.x0, bbox.top, bbox.x1, bbox.bottom); - clip.top = 0; // Start at the top of the page - clip.bottom = bbox.top; // End at the top of the table + // Clip = page area above the table + var clip = new Rect(bbox.X0, 0, bbox.X1, bbox.Y0); - var spans = new List(); - List clipBlocks = Page.GetText("dict", clip:new Rect(clip.x0, clip.top, clip.x1, clip.bottom), flags: (int)TextFlagsExtension.TEXTFLAGS_TEXT).Blocks; - foreach (Block block in clipBlocks) + // Get text blocks above table + dynamic pageInfo = page.GetText("dict", clip: clip, flags: (int)TextFlagsExtension.TEXTFLAGS_TEXT); + List blocks = pageInfo?.BLOCKS ?? 
new List(); + + // Non-empty, non-superscript spans above table, sorted descending by y1 + var spans = new List>(); + foreach (var block in blocks) { - foreach (Line line in block.Lines) + if (block.Lines == null) continue; + foreach (var line in block.Lines) { - foreach (Span span in line.Spans) + if (line.Spans == null) continue; + foreach (var span in line.Spans) { - int flag = (int)span.Flags; - if ((flag & 1) == 0 && !string.IsNullOrWhiteSpace(span.Text)) - { // ignore superscripts and empty text - spans.Add(span); + if (span.Bbox == null) continue; + string text = span.Text ?? ""; + bool isWhitespace = text.All(c => TableGlobals.WHITE_SPACES.Contains(c)); + bool isSuperscript = ((int)span.Flags & (int)FontStyle.TEXT_FONT_SUPERSCRIPT) != 0; + + if (!isWhitespace && !isSuperscript) + { + spans.Add(new Dictionary + { + { "text", text }, + { "bbox", new List { span.Bbox.X0, span.Bbox.Y0, span.Bbox.X1, span.Bbox.Y1 } }, + { "flags", span.Flags } + }); } } } } - var select = new List(); // y1 coordinates above, sorted descending - var lineHeights = new List(); // line heights above, sorted descending - var lineBolds = new List(); // bold indicator per line above, same sorting + spans = spans.OrderByDescending(s => ((List)s["bbox"])[3]).ToList(); - // spans sorted descending - spans = spans.OrderByDescending(s => s.Bbox.Y1).ToList(); + var select = new List(); + var lineHeights = new List(); + var lineBolds = new List(); - // walk through the spans and fill above 3 lists + // Walk through spans and fill the 3 lists for (int i = 0; i < spans.Count; i++) { - Span span = spans[i]; - float y1 = span.Bbox.Y1; // span bottom - float height = y1 - span.Bbox.Y0; // span bbox height - bool bold = ((int)span.Flags & 16) != 0; + var s = spans[i]; + var sbbox = s["bbox"] as List; + if (sbbox == null || sbbox.Count < 4) continue; + + float y1 = Convert.ToSingle(sbbox[3]); + float h = y1 - Convert.ToSingle(sbbox[1]); + bool bold = ((int)s["flags"] & (int)FontStyle.TEXT_FONT_BOLD) != 0; - // use first item to start the lists if (i == 0) { select.Add(y1); - lineHeights.Add(height); + lineHeights.Add(h); lineBolds.Add(bold); continue; } - // get last items from the 3 lists - float y0 = select.Last(); - float prevHeight = lineHeights.Last(); - bool prevBold = lineBolds.Last(); + float y0 = select[select.Count - 1]; + float h0 = lineHeights[lineHeights.Count - 1]; + bool bold0 = lineBolds[lineBolds.Count - 1]; - if (prevBold && !bold) - break; // stop if switching from bold to non-bold + if (bold0 && !bold) + break; - // if fitting in height of previous span, modify bbox - if (y0 - y1 <= yTolerance || Math.Abs((y0 - prevHeight) - span.Bbox.Y0) <= yTolerance) + if (y0 - y1 <= yDelta || Math.Abs((y0 - h0) - Convert.ToSingle(sbbox[1])) <= yDelta) { - span.Bbox = new Rect(span.Bbox.X0, y0 - prevHeight, span.Bbox.X1, y0); - spans[i] = span; + sbbox[1] = y0 - h0; + sbbox[3] = y0; + s["bbox"] = sbbox; + spans[i] = s; if (bold) lineBolds[lineBolds.Count - 1] = bold; continue; } - else if (y0 - y1 > 1.5 * prevHeight) + else if (y0 - y1 > 1.5 * h0) { - break; // stop if distance to previous line too large + break; } select.Add(y1); - lineHeights.Add(height); + lineHeights.Add(h); lineBolds.Add(bold); } - if (!select.Any()) // nothing above the table? 
+ if (select.Count == 0) return headerTopRow; - select = select.Take(5).ToList(); // Only accept up to 5 lines in any header + select = select.Take(5).ToList(); - // take top row as header if text above table is too far apart - if (bbox.top - select.First() >= lineHeights.First()) + // Assume top row as header if text above is too far away + if (bbox.Y0 - select[0] >= lineHeights[0]) return headerTopRow; - // If top row is bold but line above is not, return top row as header - if (topRowBold && !lineBolds.First()) + // Accept top row as header if bold, but line above is not + if (topRowBold && !lineBolds[0]) return headerTopRow; - if (!spans.Any()) // nothing left above the table, return top row + if (spans.Count == 0) return headerTopRow; // Re-compute clip above table - BBox nclip = new BBox(0,0,0,0); - foreach (var span in spans.Where(s => s.Bbox.Y1 >= select.Last())) + var nclip = new Rect(0, 0, 0, 0); + foreach (var s in spans.Where(s => Convert.ToSingle(((List)s["bbox"])[3]) >= select[select.Count - 1])) { - nclip = nclip.Union(new BBox(span.Bbox.X0, span.Bbox.Y0, span.Bbox.X1, span.Bbox.Y1)); + var sbbox = s["bbox"] as List; + if (sbbox != null && sbbox.Count >= 4) + { + var srect = new Rect( + Convert.ToSingle(sbbox[0]), + Convert.ToSingle(sbbox[1]), + Convert.ToSingle(sbbox[2]), + Convert.ToSingle(sbbox[3]) + ); + nclip = nclip | srect; + } } - if (!nclip.IsEmpty()) + if (!nclip.IsEmpty) clip = nclip; - clip.bottom = bbox.bottom; // make sure we still include every word above + clip.Y1 = bbox.Y0; // Confirm that no word in clip is intersecting a column separator - List clipWords = Page.GetTextWords(clip: new Rect(clip.x0, clip.top, clip.x1, clip.bottom)); - List wordRects = clipWords.Select(w => new BBox(w.X0, w.Y0, w.X1, w.Y1)).ToList(); - List wordTops = wordRects.Select(r => r.top).Distinct().OrderByDescending(top => top).ToList(); + // Get words from textpage or page + var textpageForWords = page.GetTextPage(clip: clip); + var words = textpageForWords.ExtractWords(); + var wordRects = words.Select(w => new Rect(w.X0, w.Y0, w.X1, w.Y1)).ToList(); + var wordTops = wordRects.Select(r => r.Y0).Distinct().OrderByDescending(y => y).ToList(); - List wordSelect = new List(); + select.Clear(); + // Exclude lines with words that intersect a column border foreach (var top in wordTops) { - bool intersecting = false; - foreach (var x in colX) - { - if (x >= 0f) - { - foreach (var r in wordRects) - { - // Check if word intersects a column border - if (r.top == top && r.x0 < x && r.x1 > x) - { - intersecting = true; - break; - } - } - } - if (intersecting) - { - break; - } - } + bool hasIntersecting = colX.Any(x => + x.HasValue && wordRects.Any(r => r.Y0 == top && r.X0 < x.Value && r.X1 > x.Value)); - if (!intersecting) + if (!hasIntersecting) { - wordSelect.Add(top); + select.Add(top); } else { - // Detected a word crossing a column border break; } } - if (wordSelect.Count == 0) // nothing left over: return first row + if (select.Count == 0) return headerTopRow; - BBox hdrBbox = clip; // compute the header cells - hdrBbox.top = wordSelect.Last(); // hdr_bbox.top is the smallest top coordinate of words + var hdrBbox = new Rect(clip.X0, select[select.Count - 1], clip.X1, clip.Y1); + hdrBbox.X0 = this.bbox.X0; + hdrBbox.X1 = this.bbox.X1; - List hdrCells = new List(); - foreach (var c in cells) + var hdrCells = cells.Select(c => + c != null ? 
new Rect(c.X0, hdrBbox.Y0, c.X1, hdrBbox.Y1) : (Rect)null + ).ToList(); + + // Column names: no line breaks, no excess spaces + var hdrNames = hdrCells.Select(c => { - if (c != null) + if (c == null) return ""; + try { - hdrCells.Add(new BBox(c.x0, hdrBbox.top, c.x1, hdrBbox.bottom)); + return page.GetTextbox(c).Replace("\n", " ").Replace(" ", " ").Trim(); } - else + catch { - hdrCells.Add(null); + return ""; } - } - - // adjust left/right of header bbox - hdrBbox.x0 = Bbox.x0; - hdrBbox.x1 = Bbox.x1; - - // List to store the processed header names - List hdrNames = new List(); - - // Process each header cell - foreach (var c in hdrCells) - { - string cText = Page.GetTextbox(new Rect(c.x0, c.top, c.x1, c.bottom)); - string name = c != null ? cText.Replace("\n", " ").Replace(" ", " ").Trim() : ""; - hdrNames.Add(name); - } + }).ToList(); return new TableHeader(hdrBbox, hdrCells, hdrNames, true); } - - private string ExtractText(List> cellChars, Dictionary kwargs) - { - // Logic to extract text from characters inside a bounding box - // Placeholder logic - return string.Join(" ", cellChars.Select(c => c["text"].ToString())); - } } - public class TableSettings - { - static readonly string[] NON_NEGATIVE_SETTINGS = { - "snap_tolerance", - "snap_x_tolerance", - "snap_y_tolerance", - "join_tolerance", - "join_x_tolerance", - "join_y_tolerance", - "edge_min_length", - "min_words_vertical", - "min_words_horizontal", - "intersection_tolerance", - "intersection_x_tolerance", - "intersection_y_tolerance", - }; + // TableSettings class + public class TableSettings + { public string vertical_strategy { get; set; } = "lines"; public string horizontal_strategy { get; set; } = "lines"; - public List explicit_vertical_lines { get; set; } = null; - public List explicit_horizontal_lines { get; set; } = null; + public List explicit_vertical_lines { get; set; } = null; + public List explicit_horizontal_lines { get; set; } = null; public float snap_tolerance { get; set; } = TableFlags.TABLE_DEFAULT_SNAP_TOLERANCE; public float snap_x_tolerance { get; set; } = TableFlags.TABLE_UNSET; public float snap_y_tolerance { get; set; } = TableFlags.TABLE_UNSET; @@ -2435,119 +2756,79 @@ public class TableSettings public TableSettings PostInit() { - // Clean up user-provided table settings. - // Validates that the table settings provided consists of acceptable values and - // returns a cleaned up version.The cleaned up version fills out the missing - // values with the default values in the provided settings. - // TODO: Can be further used to validate that the values are of the correct - // type.For example, raising a value error when a non-boolean input is - // provided for the key ``keep_blank_chars``. - // :param table_settings: User - provided table settings. - // :returns: A cleaned up version of the user - provided table settings. - // :raises ValueError: When an unrecognised key is provided. 
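// Usage sketch (hypothetical values): passing a dictionary such as
//   { "vertical_strategy": "text", "snap_tolerance": 5f, "text_x_tolerance": 2f }
// to TableSettings.Resolve(...) routes keys starting with "text_" into
// text_settings (here as "x_tolerance" = 2f), assigns the remaining entries to
// the matching properties, and PostInit() then copies snap_tolerance into
// snap_x_tolerance / snap_y_tolerance because those were left at TABLE_UNSET.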
- - foreach (string setting in NON_NEGATIVE_SETTINGS) + // Validate non-negative settings + var nonNegativeSettings = new[] { - PropertyInfo property = typeof(TableSettings).GetProperty(setting); - if (property != null) - { - var value = property.GetValue(this); - if ((float)value < 0) - { - throw new ArgumentException("Table setting " + setting + " cannot be negative"); - } - } - else - { - throw new ArgumentException("Table setting not include property " + setting); - } - } + "snap_tolerance", "snap_x_tolerance", "snap_y_tolerance", + "join_tolerance", "join_x_tolerance", "join_y_tolerance", + "edge_min_length", "min_words_vertical", "min_words_horizontal", + "intersection_tolerance", "intersection_x_tolerance", "intersection_y_tolerance" + }; - foreach (string orientation in new string[] { "horizontal", "vertical" }) + foreach (var setting in nonNegativeSettings) { - PropertyInfo property = typeof(TableSettings).GetProperty(orientation + "_strategy"); - if (property != null) - { - var strategy = property.GetValue(this); - if (Array.IndexOf(TableFlags.TABLE_STRATEGIES, strategy) == -1) - { - throw new ArgumentException(orientation + "_strategy must be one of " + string.Join(",", TableFlags.TABLE_STRATEGIES)); - } - } - else + var value = (float)GetType().GetProperty(setting).GetValue(this); + if (value < 0) { - throw new ArgumentException("Table setting not include property " + orientation + "_strategy"); + throw new ArgumentException($"Table setting '{setting}' cannot be negative"); } } - if (this.text_settings == null) - this.text_settings = new Dictionary(); - - // This next section is for backwards compatibility - foreach (string attr in new string[] { "x_tolerance", "y_tolerance" }) + // Validate strategies + if (!TableFlags.TABLE_STRATEGIES.Contains(vertical_strategy)) { - if (!this.text_settings.ContainsKey(attr)) - { - this.text_settings[attr] = this.text_settings.ContainsKey("tolerance") ? 
this.text_settings["tolerance"] : 3.0f; - } + throw new ArgumentException($"vertical_strategy must be one of {{{string.Join(",", TableFlags.TABLE_STRATEGIES)}}}"); } - if (this.text_settings.ContainsKey("tolerance")) + if (!TableFlags.TABLE_STRATEGIES.Contains(horizontal_strategy)) { - this.text_settings.Remove("tolerance"); + throw new ArgumentException($"horizontal_strategy must be one of {{{string.Join(",", TableFlags.TABLE_STRATEGIES)}}}"); } - // End of that section - var mappings = new (string attr, string fallback)[] - { - ("snap_x_tolerance", "snap_tolerance"), - ("snap_y_tolerance", "snap_tolerance"), - ("join_x_tolerance", "join_tolerance"), - ("join_y_tolerance", "join_tolerance"), - ("intersection_x_tolerance", "intersection_tolerance"), - ("intersection_y_tolerance", "intersection_tolerance") - }; - foreach (var (attr, fallback) in mappings) + if (text_settings == null) { - // Get the property info for the current attribute and fallback - PropertyInfo attrProperty = typeof(TableSettings).GetProperty(attr); - PropertyInfo fallbackProperty = typeof(TableSettings).GetProperty(fallback); - - if (attrProperty != null && fallbackProperty != null) - { - float attrValue = (float)attrProperty.GetValue(this); - if (attrValue == TableFlags.TABLE_UNSET) - { - float fallbackValue = (float)fallbackProperty.GetValue(this); - attrProperty.SetValue(this, fallbackValue); - } - } + text_settings = new Dictionary(); } + // Set defaults for unset tolerances + if (snap_x_tolerance == TableFlags.TABLE_UNSET) + snap_x_tolerance = snap_tolerance; + if (snap_y_tolerance == TableFlags.TABLE_UNSET) + snap_y_tolerance = snap_tolerance; + if (join_x_tolerance == TableFlags.TABLE_UNSET) + join_x_tolerance = join_tolerance; + if (join_y_tolerance == TableFlags.TABLE_UNSET) + join_y_tolerance = join_tolerance; + if (intersection_x_tolerance == TableFlags.TABLE_UNSET) + intersection_x_tolerance = intersection_tolerance; + if (intersection_y_tolerance == TableFlags.TABLE_UNSET) + intersection_y_tolerance = intersection_tolerance; + return this; } - public static TableSettings resolve(object settings = null) + public static TableSettings Resolve(object settings = null) { if (settings == null) { - return new TableSettings(); + return new TableSettings().PostInit(); } - else if (settings is TableSettings tableSettings) + + if (settings is TableSettings ts) { - return tableSettings; + return ts.PostInit(); } - else if (settings is Dictionary settingsDict) + + if (settings is Dictionary dict) { var coreSettings = new Dictionary(); var textSettings = new Dictionary(); - // Loop over the dictionary and separate text_ settings - foreach (var kvp in settingsDict) + foreach (var kvp in dict) { if (kvp.Key.StartsWith("text_")) { - textSettings[kvp.Key.Substring(5)] = kvp.Value.ToString(); + textSettings[kvp.Key.Substring(5)] = kvp.Value; } else { @@ -2555,669 +2836,834 @@ public static TableSettings resolve(object settings = null) } } - // Add textSettings to coreSettings before passing to the constructor coreSettings["text_settings"] = textSettings; - var instance = new TableSettings(); + var tableSettings = new TableSettings(); foreach (var kvp in coreSettings) { - var property = instance.GetType().GetProperty(kvp.Key); - - if (property != null) + var prop = typeof(TableSettings).GetProperty(kvp.Key); + if (prop != null && prop.CanWrite) { - property.SetValue(instance, kvp.Value); + prop.SetValue(tableSettings, kvp.Value); } - else + } + + return tableSettings.PostInit(); + } + + throw new ArgumentException($"Cannot 
resolve settings: {settings}"); + } + } + + // FindTables function - C# port of find_tables from table.py + public static class TableFinderHelper + { + /// + /// Find tables on a page and return a TableFinder object. + /// This is the C# port of the find_tables function from table.py. + /// + public static TableFinder FindTables( + Page page, + Rect clip = null, + string vertical_strategy = "lines", + string horizontal_strategy = "lines", + List vertical_lines = null, + List horizontal_lines = null, + float snap_tolerance = TableFlags.TABLE_DEFAULT_SNAP_TOLERANCE, + float? snap_x_tolerance = null, + float? snap_y_tolerance = null, + float join_tolerance = TableFlags.TABLE_DEFAULT_JOIN_TOLERANCE, + float? join_x_tolerance = null, + float? join_y_tolerance = null, + float edge_min_length = 3.0f, + float min_words_vertical = TableFlags.TABLE_DEFAULT_MIN_WORDS_VERTICAL, + float min_words_horizontal = TableFlags.TABLE_DEFAULT_MIN_WORDS_HORIZONTAL, + float intersection_tolerance = 3.0f, + float? intersection_x_tolerance = null, + float? intersection_y_tolerance = null, + float text_tolerance = 3.0f, + float text_x_tolerance = 3.0f, + float text_y_tolerance = 3.0f, + string strategy = null, + List> add_lines = null, + List add_boxes = null, + List paths = null + ) + { + // Clear global state + TableGlobals.CHARS.Clear(); + TableGlobals.EDGES.Clear(); + TableGlobals.TEXTPAGE = null; + + // Handle page rotation + int oldRotation = page.Rotation; + bool needsRotationReset = oldRotation != 0; + Rect oldMediabox = null; + + if (needsRotationReset) + { + oldMediabox = page.MediaBox; + page.SetRotation(0); + // Note: In Python, page_rotation_set0 also handles xref and mediabox changes + // For now, we'll just reset rotation - full implementation may require more complex handling + } + + // Handle UNSET values (None in Python becomes null in C#, use TABLE_UNSET) + float snapX = snap_x_tolerance ?? TableFlags.TABLE_UNSET; + float snapY = snap_y_tolerance ?? TableFlags.TABLE_UNSET; + float joinX = join_x_tolerance ?? TableFlags.TABLE_UNSET; + float joinY = join_y_tolerance ?? TableFlags.TABLE_UNSET; + float interX = intersection_x_tolerance ?? TableFlags.TABLE_UNSET; + float interY = intersection_y_tolerance ?? 
TableFlags.TABLE_UNSET; + + if (strategy != null) + { + vertical_strategy = strategy; + horizontal_strategy = strategy; + } + + Dictionary settings = new Dictionary + { + { "vertical_strategy", vertical_strategy }, + { "horizontal_strategy", horizontal_strategy }, + { "explicit_vertical_lines", vertical_lines }, + { "explicit_horizontal_lines", horizontal_lines }, + { "snap_tolerance", snap_tolerance }, + { "snap_x_tolerance", snapX }, + { "snap_y_tolerance", snapY }, + { "join_tolerance", join_tolerance }, + { "join_x_tolerance", joinX }, + { "join_y_tolerance", joinY }, + { "edge_min_length", edge_min_length }, + { "min_words_vertical", min_words_vertical }, + { "min_words_horizontal", min_words_horizontal }, + { "intersection_tolerance", intersection_tolerance }, + { "intersection_x_tolerance", interX }, + { "intersection_y_tolerance", interY }, + { "text_tolerance", text_tolerance }, + { "text_x_tolerance", text_x_tolerance }, + { "text_y_tolerance", text_y_tolerance } + }; + + TableFinder tbf = null; + try + { + // Get layout information if available + List layoutBoxes = new List(); + try + { + // Try to get layout information - this may not be available in all MuPDF.NET versions + // In Python: page.get_layout() and page.layout_information + // For now, we'll skip this and proceed with table detection + } + catch + { + // Layout information not available, continue without it + } + + // Resolve settings + TableSettings tset = TableSettings.Resolve(settings); + + // Create character list + TextPage textpage = TablePageProcessing.MakeChars(page, clip: clip); + TableGlobals.TEXTPAGE = textpage; + + // Create edges + TablePageProcessing.MakeEdges( + page, + clip: clip, + tset: tset, + paths: paths, + addLines: add_lines, + addBoxes: add_boxes + ); + + // Create TableFinder + tbf = new TableFinder(page, tset); + tbf.textpage = textpage; + + // Filter tables based on layout boxes if available + if (layoutBoxes.Count > 0) + { + tbf.tables = tbf.tables.Where(tab => + layoutBoxes.Any(box => IoU(tab.bbox, box) >= 0.6f) + ).ToList(); + + // Find layout boxes that don't match any found table + List unmatchedBoxes = layoutBoxes.Where(box => + tbf.tables.All(tab => IoU(box, tab.bbox) < 0.6f) + ).ToList(); + + // Create tables from unmatched layout boxes + if (unmatchedBoxes.Count > 0) { - throw new ArgumentException($"Invalid parameter: {kvp.Key}"); + // Extract words for make_table_from_bbox + var words = textpage.ExtractWords(); + List wordRects = words.Select(w => new Rect(w.X0, w.Y0, w.X1, w.Y1)).ToList(); + + // Create a textpage with TABLE_DETECTOR_FLAGS for make_table_from_bbox + TextPage tp2 = page.GetTextPage(flags: TableGlobals.TABLE_DETECTOR_FLAGS); + + foreach (Rect rect in unmatchedBoxes) + { + List cells = TableHelpers.MakeTableFromBbox(tp2, wordRects, rect); + if (cells.Count > 0) + { + tbf.tables.Add(new Table(page, cells)); + } + } } } - return instance.PostInit(); + // Set textpage for all tables + foreach (var table in tbf.tables) + { + table.textpage = textpage; + } } - else + catch (Exception ex) + { + // Log exception (equivalent to pymupdf.message in Python) + System.Diagnostics.Debug.WriteLine($"find_tables: exception occurred: {ex.Message}"); + return null; + } + finally { - throw new ArgumentException($"Cannot resolve settings: {settings}"); + if (needsRotationReset && oldRotation != 0) + { + page.SetRotation(oldRotation); + // Note: Full page_rotation_reset would also restore mediabox and xref + } } + + return tbf; + } + + /// + /// Compute intersection over union (IoU) 
of two rectangles. + /// + private static float IoU(Rect r1, Rect r2) + { + float ix = Math.Max(0, Math.Min(r1.X1, r2.X1) - Math.Max(r1.X0, r2.X0)); + float iy = Math.Max(0, Math.Min(r1.Y1, r2.Y1) - Math.Max(r1.Y0, r2.Y0)); + float intersection = ix * iy; + + if (intersection == 0) + return 0; + + float area1 = (r1.X1 - r1.X0) * (r1.Y1 - r1.Y0); + float area2 = (r2.X1 - r2.X0) * (r2.Y1 - r2.Y0); + return intersection / (area1 + area2 - intersection); } } + // TableFinder class public class TableFinder { - private readonly Page page; - private readonly TableSettings settings; - private readonly List edges; - private readonly Dictionary intersections; - private readonly List cells; - private readonly List
tables; - - private TextPage TEXTPAGE; - private List EDGES; - private List CHARS; - - public TableFinder(Page page, Rect clip, TableSettings settings = null) - { - TEXTPAGE = page.GetTextPage(clip, flags: (int)TextFlagsExtension.TEXTFLAGS_TEXT); - TEXTPAGE.Parent = page; - EDGES = new List(); - CHARS = new List(); - make_chars(page, clip); - make_edges(page, clip, settings); + public Page page { get; set; } + public TextPage textpage { get; set; } + public TableSettings settings { get; set; } + public List edges { get; set; } + public Dictionary, Dictionary>> intersections { get; set; } + public List cells { get; set; } + public List
tables { get; set; } + + public TableFinder(Page page, TableSettings settings = null) + { this.page = page; - this.settings = settings; - this.edges = get_edges(); - this.intersections = edges_to_intersections(this.edges, + this.settings = settings ?? TableSettings.Resolve(); + this.edges = GetEdges(); + this.intersections = EdgeProcessing.EdgesToIntersections( + this.edges, this.settings.intersection_x_tolerance, - this.settings.intersection_y_tolerance); - this.cells = intersections_to_cells(this.intersections); - this.tables = new List
(); - - foreach (var cellGroup in cells_to_tables(this.page, this.cells)) - { - this.tables.Add(new Table(this.page, cellGroup, CHARS)); - } + this.settings.intersection_y_tolerance + ); + this.cells = EdgeProcessing.IntersectionsToCells(this.intersections); + var cellGroups = EdgeProcessing.CellsToTables(this.page, this.cells); + this.tables = cellGroups.Select(cg => new Table(this.page, cg)).ToList(); } - private List get_edges() + private List GetEdges() { var settings = this.settings; + var edges = new List(); - var strategy = settings.vertical_strategy; - if (strategy == "explicit") - { - var lines = settings.explicit_vertical_lines; - if (lines.Count < 2) - { - throw new Exception("If vertical_strategy == 'explicit', " + - "explicit_vertical_lines " + - "must be specified as a list/tuple of two or more " + - "floats/ints."); - } - } - strategy = settings.horizontal_strategy; - if (strategy == "explicit") + // Validate explicit strategies + foreach (string orientation in new[] { "vertical", "horizontal" }) { - var lines = settings.explicit_horizontal_lines; - if (lines.Count < 2) + string strategy = orientation == "vertical" ? settings.vertical_strategy : settings.horizontal_strategy; + if (strategy == "explicit") { - throw new Exception("If horizontal_strategy == 'explicit', " + - "explicit_horizontal_lines " + - "must be specified as a list/tuple of two or more " + - "floats/ints."); + var lines = orientation == "vertical" ? settings.explicit_vertical_lines : settings.explicit_horizontal_lines; + if (lines == null || lines.Count < 2) + { + throw new ArgumentException( + $"If {orientation}_strategy == 'explicit', " + + $"explicit_{orientation}_lines must be specified as a list of two or more edges."); + } } } - string v_strat = settings.vertical_strategy; - string h_strat = settings.horizontal_strategy; + string vStrat = settings.vertical_strategy; + string hStrat = settings.horizontal_strategy; - List words = new List(); - if (v_strat == "text" || h_strat == "text") - words = extract_words(CHARS, settings.text_settings); + List> words = new List>(); + if (vStrat == "text" || hStrat == "text") + { + words = TextExtractionHelpers.ExtractWords(TableGlobals.CHARS, settings.text_settings ?? 
new Dictionary()); + } - List v_explicit = new List(); + // Vertical edges + var vExplicit = new List(); if (settings.explicit_vertical_lines != null) { foreach (var desc in settings.explicit_vertical_lines) { - if (desc is Edge descEdge) + if (desc is float x) { - foreach (Edge e in obj_to_edges(descEdge)) + vExplicit.Add(new Edge + { + x0 = x, + x1 = x, + top = page.Rect.Y0, + bottom = page.Rect.Y1, + height = page.Rect.Height, + orientation = "v" + }); + } + else if (desc is Dictionary dict) + { + // Convert dictionary to Edge (similar to obj_to_edges in Python) + var convertedEdges = EdgeProcessing.ObjToEdges(dict); + foreach (var e in convertedEdges) { if (e.orientation == "v") - v_explicit.Add(e); + vExplicit.Add(e); } } + else if (desc is Edge edge) + { + if (edge.orientation == "v") + vExplicit.Add(edge); + } } } - List v_base = new List(); - if (v_strat == "lines") - v_base = filter_edges(EDGES, "v"); - else if (v_strat == "lines_strict") - v_base = filter_edges(EDGES, "v", edgeType: "lines"); - else if (v_strat == "text") - v_base = words_to_edges_v(words, wordThreshold:(int)settings.min_words_vertical); - else if (v_strat == "explicit") - v_base.Clear(); - else - v_base.Clear(); + List vBase = new List(); + if (vStrat == "lines") + { + vBase = TableGlobals.EDGES.Where(e => e.orientation == "v").ToList(); + } + else if (vStrat == "lines_strict") + { + vBase = TableGlobals.EDGES.Where(e => e.orientation == "v" && e.object_type == "line").ToList(); + } + else if (vStrat == "text") + { + vBase = EdgeProcessing.WordsToEdgesV(words, (int)settings.min_words_vertical); + } - List v = v_base.Concat(v_explicit).ToList(); + var v = vBase.Concat(vExplicit).ToList(); - List h_explicit = new List(); + // Horizontal edges + var hExplicit = new List(); if (settings.explicit_horizontal_lines != null) { foreach (var desc in settings.explicit_horizontal_lines) { - if (desc is Edge descEdge) + if (desc is float y) { - foreach (Edge e in obj_to_edges(descEdge)) + hExplicit.Add(new Edge + { + x0 = page.Rect.X0, + x1 = page.Rect.X1, + top = y, + bottom = y, + width = page.Rect.Width, + orientation = "h" + }); + } + else if (desc is Dictionary dict) + { + // Convert dictionary to Edge (similar to obj_to_edges in Python) + var convertedEdges = EdgeProcessing.ObjToEdges(dict); + foreach (var e in convertedEdges) { if (e.orientation == "h") - h_explicit.Add(e); + hExplicit.Add(e); } } + else if (desc is Edge edge) + { + if (edge.orientation == "h") + hExplicit.Add(edge); + } } } - List h_base = new List(); - if (h_strat == "lines") - h_base = filter_edges(EDGES, "h"); - else if (h_strat == "lines_strict") - h_base = filter_edges(EDGES, "h", edgeType: "lines"); - else if (h_strat == "text") - h_base = words_to_edges_h(words, wordThreshold:(int)settings.min_words_horizontal); - else if (h_strat == "explicit") - h_base.Clear(); - else - h_base.Clear(); - - List h = h_base.Concat(h_explicit).ToList(); + List hBase = new List(); + if (hStrat == "lines") + { + hBase = TableGlobals.EDGES.Where(e => e.orientation == "h").ToList(); + } + else if (hStrat == "lines_strict") + { + hBase = TableGlobals.EDGES.Where(e => e.orientation == "h" && e.object_type == "line").ToList(); + } + else if (hStrat == "text") + { + hBase = EdgeProcessing.WordsToEdgesH(words, (int)settings.min_words_horizontal); + } - List edges = new List(); - edges.AddRange(v); - edges.AddRange(h); + var h = hBase.Concat(hExplicit).ToList(); - edges = merge_edges( + edges = v.Concat(h).ToList(); + edges = EdgeProcessing.MergeEdges( edges, - 
snap_x_tolerance: settings.snap_x_tolerance, - snap_y_tolerance: settings.snap_y_tolerance, - join_x_tolerance: settings.join_x_tolerance, - join_y_tolerance: settings.join_y_tolerance - ); + settings.snap_x_tolerance, + settings.snap_y_tolerance, + settings.join_x_tolerance, + settings.join_y_tolerance + ); + + return EdgeProcessing.FilterEdges(edges, minLength: settings.edge_min_length); + } - return filter_edges(edges, minLength: settings.edge_min_length); + public static List<Table>
FindTables(Page page, Rect clip, TableSettings settings) + { + var finder = new TableFinder(page, settings); + return finder.tables; } + public Table this[int i] { get { - int tcount = this.tables.Count; - if (i >= tcount || i < 0) - { + int tcount = tables.Count; + if (i >= tcount) throw new IndexOutOfRangeException("table not on page"); - } - return this.tables[i]; + while (i < 0) + i += tcount; + return tables[i]; } } + } - // Nullify page rotation. - // To correctly detect tables, page rotation must be zero. - // This function performs the necessary adjustments and returns information - // for reverting this changes. - private static Page page_rotation_set0(Page page) + // Functions for making chars and edges from page + internal static class TablePageProcessing + { + // make_chars - Extract text as "rawdict" to fill CHARS + internal static TextPage MakeChars(Page page, Rect clip = null) { - Rect mediabox = page.MediaBox; - int rot = page.Rotation; // contains normalized rotation value - // need to derotate the page's content - Rect mb = page.MediaBox; // current mediabox - - Matrix mat0 = new Matrix(); - if (rot == 90) - { - // before derotation, shift content horizontally - mat0 = new Matrix(1, 0, 0, 1, mb.Y1 - mb.X1 - mb.X0 - mb.Y0, 0); - } - else if (rot == 270) - { - // before derotation, shift content vertically - mat0 = new Matrix(1, 0, 0, 1, 0, mb.X1 - mb.Y1 - mb.Y0 - mb.X0); - } - else - { - mat0 = new Matrix(1, 0, 0, 1, -2 * mb.X0, -2 * mb.Y0); - } - - // swap x- and y-coordinates - if (rot == 90 || rot == 270) - { - float x0 = mb.X0; - float y0 = mb.Y0; - float x1 = mb.X1; - float y1 = mb.Y1; - mb.X0 = y0; - mb.Y0 = x0; - mb.X1 = y1; - mb.X1 = x1; - page.SetMediaBox(mb); - } - - page.SetRotation(0); - - return page; - } + int pageNumber = page.Number + 1; + float pageHeight = page.Rect.Height; + var ctm = page.TransformationMatrix; - private void make_chars(Page page, Rect clip = null) - { - int page_number = page.Number + 1; - float page_height = page.Rect.Height; - Matrix ctm = page.TransformationMatrix; - float doctop_base = page_height * page.Number; - List blocks = page.GetText("rawdict", textpage: TEXTPAGE).Blocks; + var flags = TableGlobals.FLAGS; + var textpage = page.GetTextPage(clip: clip, flags: flags); + TableGlobals.TEXTPAGE = textpage; - //List blocks = (page.GetText("rawdict", clip, flags: (int)TextFlagsExtension.TEXTFLAGS_TEXT) as PageInfo).Blocks; + var pageInfo = textpage.ExtractRAWDict(cropbox: clip, sort: false); + float doctopBase = pageHeight * page.Number; - foreach (var block in blocks) + foreach (var block in pageInfo.Blocks) { + if (block.Lines == null) continue; + foreach (var line in block.Lines) { - Point ldir = line.Dir; // = (cosine, sine) of angle - ldir = new Point((float)Math.Round(ldir.X, 4), (float)Math.Round(ldir.Y, 4)); - Matrix matrix = new Matrix(ldir.X, -ldir.Y, ldir.Y, ldir.X, 0, 0); - bool upright = ldir.Y == 0f; + var ldir = line.Dir; + var ldirRounded = Tuple.Create((float)Math.Round(ldir.X, 4), (float)Math.Round(ldir.Y, 4)); + var matrix = new Matrix(ldirRounded.Item1, -ldirRounded.Item2, ldirRounded.Item2, ldirRounded.Item1, 0, 0); + bool upright = ldirRounded.Item2 == 0; - foreach (var span in line.Spans.OrderBy(s => s.Bbox.X0)) + if (line.Spans == null) continue; + var sortedSpans = line.Spans.OrderBy(s => s.Bbox.X0).ToList(); + + foreach (var span in sortedSpans) { string fontname = span.Font; float fontsize = span.Size; - int color = span.Color; - - foreach (var character in span.Chars.OrderBy(c => c.Bbox.x0)) + bool 
spanBold = ((int)span.Flags & (int)FontStyle.TEXT_FONT_BOLD) != 0; + var colorInt = span.Color; + + // Extract RGB from int color (ARGB format: AARRGGBB) + // Normalize to 0-1 range for PDF color space + float r = ((colorInt >> 16) & 0xFF) / 255.0f; + float g = ((colorInt >> 8) & 0xFF) / 255.0f; + float b = (colorInt & 0xFF) / 255.0f; + + if (span.Chars == null) continue; + var sortedChars = span.Chars.OrderBy(c => c.Bbox.x0).ToList(); + + foreach (var char_ in sortedChars) { - Rect bbox = new Rect(character.Bbox); - Rect bbox_ctm = bbox * ctm; - Point origin = new Point(character.Origin) * ctm; - + var charBbox = char_.Bbox; + var bboxCtm = new Rect(charBbox) * ctm; + var origin = new Point(char_.Origin) * ctm; matrix.E = origin.X; matrix.F = origin.Y; + string text = char_.C.ToString(); - string text = character.C.ToString(); - var charDict = new Character(); - charDict.adv = upright ? bbox.X1 - bbox.X0 : bbox.Y1 - bbox.Y0; - charDict.bottom = bbox.Y1; - charDict.doctop = bbox.Y0 + doctop_base; - charDict.fontname = fontname; - charDict.height = bbox.Y1 - bbox.Y0; - charDict.matrix = matrix; - charDict.ncs = "DeviceRGB"; - charDict.non_stroking_color = color; - charDict.non_stroking_pattern = null; - charDict.object_type = "char"; - charDict.page_number = page_number; - charDict.size = upright ? fontsize : bbox.Y1 - bbox.Y0; - charDict.stroking_color = color; - charDict.stroking_pattern = null; - charDict.text = text; - charDict.top = bbox.Y0; - charDict.upright = upright; - charDict.width = bbox.X1 - bbox.X0; - charDict.x0 = bbox.X0; - charDict.x1 = bbox.X1; - charDict.y0 = bbox_ctm.Y0; - charDict.y1 = bbox_ctm.Y1; - CHARS.Add(charDict); + var charDict = new CharDict + { + adv = upright ? (charBbox.x1 - charBbox.x0) : (charBbox.y1 - charBbox.y0), + bottom = charBbox.y1, + doctop = charBbox.y0 + doctopBase, + fontname = fontname, + height = charBbox.y1 - charBbox.y0, + matrix = Tuple.Create(matrix.A, matrix.B, matrix.C, matrix.D, matrix.E, matrix.F), + ncs = "DeviceRGB", + non_stroking_color = Tuple.Create(r, g, b), + non_stroking_pattern = null, + object_type = "char", + page_number = pageNumber, + size = upright ? fontsize : (charBbox.y1 - charBbox.y0), + stroking_color = Tuple.Create(r, g, b), + stroking_pattern = null, + bold = spanBold, + text = text, + top = charBbox.y0, + upright = upright, + width = charBbox.x1 - charBbox.x0, + x0 = charBbox.x0, + x1 = charBbox.x1, + y0 = bboxCtm.Y0, + y1 = bboxCtm.Y1 + }; + + TableGlobals.CHARS.Add(charDict); } } } } - } - // ------------------------------------------------------------------------ - // Extract all page vector graphics to fill the EDGES list. - // We are ignoring Bézier curves completely and are converting everything - // else to lines. 
- // ------------------------------------------------------------------------ + return textpage; + } - private void make_edges(Page page, Rect clip = null, TableSettings tset = null) + // make_edges - Extract all page vector graphics to fill the EDGES list + internal static void MakeEdges( + Page page, + Rect clip = null, + TableSettings tset = null, + List paths = null, + List> addLines = null, + List addBoxes = null) { - float snap_x = tset.snap_x_tolerance; - float snap_y = tset.snap_y_tolerance; - float min_length = tset.edge_min_length; + if (tset == null) + tset = TableSettings.Resolve(); + float snapX = tset.snap_x_tolerance; + float snapY = tset.snap_y_tolerance; + float minLength = tset.edge_min_length; bool linesStrict = tset.vertical_strategy == "lines_strict" || tset.horizontal_strategy == "lines_strict"; - float page_height = page.Rect.Height; - float doctop_basis = page.Number * page_height; - int page_number = page.Number + 1; - Rect prect = page.Rect; + float pageHeight = page.Rect.Height; + float doctopBasis = page.Number * pageHeight; + int pageNumber = page.Number + 1; + var prect = page.Rect; if (page.Rotation == 90 || page.Rotation == 270) { - float w = prect.BottomRight.X; - float h = prect.BottomRight.Y; + float w = prect.Width; + float h = prect.Height; prect = new Rect(0, 0, h, w); } - if (clip != null) - clip = new Rect(clip); - else + if (clip == null) clip = prect; + else + clip = new Rect(clip.X0, clip.Y0, clip.X1, clip.Y1); - // Detect whether r1, r2 are neighbors. - // Defined as: - // The minimum distance between points of r1 and points of r2 is not - // larger than some delta. - // This check supports empty rect-likes and thus also lines. - // Note: - // This type of check is MUCH faster than native Rect containment checks. - bool are_neighbors(Rect r1, Rect r2) - { - return ( // check if x-coordinates of r1 are within those of r2 - (r2.X0 - snap_x <= r1.X0 && r1.X0 <= r2.X1 + snap_x) || - (r2.X0 - snap_x <= r1.X1 && r1.X1 <= r2.X1 + snap_x) - ) && ( - (r2.Y0 - snap_y <= r1.Y0 && r1.Y0 <= r2.Y1 + snap_y) || - (r2.Y0 - snap_y <= r1.Y1 && r1.Y1 <= r2.Y1 + snap_y) - ) || // same check with r1 / r2 exchanging their roles (this is necessary!) - ( - (r1.X0 - snap_x <= r2.X0 && r2.X0 <= r1.X1 + snap_x) || - (r1.X0 - snap_x <= r2.X1 && r2.X1 <= r1.X1 + snap_x) - ) && ( - (r1.Y0 - snap_y <= r2.Y0 && r2.Y0 <= r1.Y1 + snap_y) || - (r1.Y0 - snap_y <= r2.Y1 && r2.Y1 <= r1.Y1 + snap_y) - ); + // Helper: Check if two rects are neighbors + bool AreNeighbors(Rect r1, Rect r2) + { + if ((r2.X0 - snapX <= r1.X0 && r1.X0 <= r2.X1 + snapX || + r2.X0 - snapX <= r1.X1 && r1.X1 <= r2.X1 + snapX) && + (r2.Y0 - snapY <= r1.Y0 && r1.Y0 <= r2.Y1 + snapY || + r2.Y0 - snapY <= r1.Y1 && r1.Y1 <= r2.Y1 + snapY)) + return true; + + if ((r1.X0 - snapX <= r2.X0 && r2.X0 <= r1.X1 + snapX || + r1.X0 - snapX <= r2.X1 && r2.X1 <= r1.X1 + snapX) && + (r1.Y0 - snapY <= r2.Y0 && r2.Y0 <= r1.Y1 + snapY || + r1.Y0 - snapY <= r2.Y1 && r2.Y1 <= r1.Y1 + snapY)) + return true; + + return false; } - // Detect and join rectangles of "connected" vector graphics. - (List, List) clean_graphics() + // Helper: Clean graphics - detect and join rectangles + Tuple, List> CleanGraphics(List npaths = null) { - // Detect and join rectangles of "connected" vector graphics. - List _paths = new List(); + List allpaths = npaths ?? 
page.GetDrawings(); + var pathsList = new List(); - foreach (var p in page.GetDrawings()) + foreach (var p in allpaths) { - // ignore fill-only graphics if they do not simulate lines, - // which means one of width or height are small. - if (p.Type == "f" && linesStrict && p.Rect.Width > snap_x && p.Rect.Height > snap_y) - { + if (linesStrict && p.Type == "f" && p.Rect.Width > snapX && p.Rect.Height > snapY) continue; - } - _paths.Add(p); - } - - // start with all vector graphics rectangles - List prects = _paths.Select(p => p.Rect) - .Distinct() - .OrderBy(r => (r.Y1, r.X0)) - .ToList(); - - List _bboxes = new List(); - foreach (var p in prects) - { - _bboxes.Add(BBox.RectToBBox(p)); - } - _bboxes = _bboxes.Distinct().ToList(); - prects.Clear(); - foreach (var b in _bboxes) - { - prects.Add(BBox.BBoxToRect(b)); + pathsList.Add(p); } - List newRects = new List(); // the final list of joined rectangles + var prects = pathsList.Select(p => p.Rect).Distinct() + .OrderBy(r => r.Y1).ThenBy(r => r.X0).ToList(); + var newRects = new List(); - // ---------------------------------------------------------------- - // Strategy: Join rectangles that "almost touch" each other. - // Extend first rectangle with any other that is a "neighbor". - // Then move it to the final list and continue with the rest. - // ---------------------------------------------------------------- - while (prects.Count > 0) // The algorithm will empty this list. + while (prects.Count > 0) { - Rect prect0 = prects[0]; // Copy of the first rectangle (performance reasons). + var prect0 = prects[0]; bool repeat = true; - while (repeat) // This loop extends the first rect in the list. + while (repeat) { - repeat = false; // Set to true again if some other rect touches. - - for (int i = prects.Count - 1; i > 0; i--) // Run backwards. + repeat = false; + for (int i = prects.Count - 1; i > 0; i--) { - if (are_neighbors(prect0, prects[i])) // Close enough to rect 0? + if (AreNeighbors(prect0, prects[i])) { - // Extend rect 0. - prect0.X0 = Math.Min(prect0.X0, prects[i].X0); - prect0.Y0 = Math.Min(prect0.Y0, prects[i].Y0); - prect0.X1 = Math.Max(prect0.X1, prects[i].X1); - prect0.Y1 = Math.Max(prect0.Y1, prects[i].Y1); - - prects.RemoveAt(i); // Delete this rect. - repeat = true; // Keep checking the rest. + prect0 = prect0 | prects[i]; + prects.RemoveAt(i); + repeat = true; } } } - // Move rect 0 over to the result list if there is some text in it. - if (!string.IsNullOrWhiteSpace(page.GetTextbox(prect0, textPage: TEXTPAGE))) - { - // Contains text, so accept it as a table bbox candidate. + if (TableHelpers.CharsInRect(TableGlobals.CHARS, prect0)) newRects.Add(prect0); - } - prects.RemoveAt(0); // Remove from rect list. + prects.RemoveAt(0); } - return (newRects, _paths); + return Tuple.Create(newRects, pathsList); } - (List bboxes, List paths) = clean_graphics(); + var (bboxes, cleanedPaths) = CleanGraphics(paths); + // Helper: Check if line is roughly axis-parallel bool IsParallel(Point p1, Point p2) { - if (p1 == null || p2 == null) - { - return false; - } - // Check if the line is roughly parallel to either the X or Y axis - if (Math.Abs(p1.X - p2.X) <= snap_x || Math.Abs(p1.Y - p2.Y) <= snap_y) - { - return true; - } - return false; + return Math.Abs(p1.X - p2.X) <= snapX || Math.Abs(p1.Y - p2.Y) <= snapY; } - // Given 2 points, make a line dictionary for table detection. 
- Edge make_line(PathInfo p, Point p1, Point p2, Rect _clip) + // Helper: Make line dictionary + Dictionary MakeLine(PathInfo p, Point p1, Point p2, Rect clipRect) { - if (!IsParallel(p1, p2)) // only accepting axis-parallel lines - { + if (!IsParallel(p1, p2)) return null; - } - // Compute the extremal values float x0 = Math.Min(p1.X, p2.X); float x1 = Math.Max(p1.X, p2.X); float y0 = Math.Min(p1.Y, p2.Y); float y1 = Math.Max(p1.Y, p2.Y); - // Check for outside _clip - if (x0 > _clip.X1 || x1 < _clip.X0 || y0 > _clip.Y1 || y1 < _clip.Y0) - { + if (x0 > clipRect.X1 || x1 < clipRect.X0 || y0 > clipRect.Y1 || y1 < clipRect.Y0) return null; - } - - if (x0 < _clip.X0) x0 = _clip.X0; // Adjust to _clip boundary - if (x1 > _clip.X1) x1 = _clip.X1; // Adjust to _clip boundary - if (y0 < _clip.Y0) y0 = _clip.Y0; // Adjust to _clip boundary - if (y1 > _clip.Y1) y1 = _clip.Y1; // Adjust to _clip boundary - float width = x1 - x0; // From adjusted values - float height = y1 - y0; // From adjusted values + if (x0 < clipRect.X0) x0 = clipRect.X0; + if (x1 > clipRect.X1) x1 = clipRect.X1; + if (y0 < clipRect.Y0) y0 = clipRect.Y0; + if (y1 > clipRect.Y1) y1 = clipRect.Y1; + float width = x1 - x0; + float height = y1 - y0; if (width == 0 && height == 0) - { - return null; // Nothing left to deal with - } - - Edge line_dict = new Edge(); - line_dict.x0 = x0; - line_dict.y0 = page_height - y0; - line_dict.x1 = x1; - line_dict.y1 = page_height - y1; - line_dict.width = width; - line_dict.height = height; - line_dict.pts = new Point[] { new Point(x0, y0), new Point(x1, y1) }; - line_dict.linewidth = p.Width; - line_dict.stroke = true; - line_dict.fill = false; - line_dict.evenodd = false; - line_dict.stroking_color = (p.Color != null && p.Color.Length > 0) ? p.Color : p.Fill; - line_dict.non_stroking_color = null; - line_dict.object_type = "line"; - line_dict.page_number = page_number; - line_dict.stroking_pattern = null; - line_dict.non_stroking_pattern = null; - line_dict.top = y0; - line_dict.bottom = y1; - line_dict.doctop = y0 + doctop_basis; + return null; - return line_dict; + return new Dictionary + { + { "x0", x0 }, + { "y0", pageHeight - y0 }, + { "x1", x1 }, + { "y1", pageHeight - y1 }, + { "width", width }, + { "height", height }, + { "pts", new List { new List { x0, y0 }, new List { x1, y1 } } }, + { "linewidth", p.Width }, + { "stroke", true }, + { "fill", false }, + { "evenodd", false }, + { "stroking_color", p.Color ?? 
p.Fill }, + { "non_stroking_color", null }, + { "object_type", "line" }, + { "page_number", pageNumber }, + { "stroking_pattern", null }, + { "non_stroking_pattern", null }, + { "top", y0 }, + { "bottom", y1 }, + { "doctop", y0 + doctopBasis } + }; } - foreach (PathInfo p in paths) + // Process paths + foreach (var p in cleanedPaths) { - List items = p.Items; // items in this path + if (p.Items == null) continue; + + var items = new List(p.Items); - // if 'closePath', add a line from last to first point - if (p.ClosePath && items.First().Type == "l" && items.Last().Type == "l") + // If closePath, add line from last to first point + if (p.ClosePath && items.Count > 0 && items[0].Type == "l" && items[items.Count - 1].Type == "l") { - Item line = new Item() + var lastItem = items[items.Count - 1]; + var firstItem = items[0]; + if (lastItem.P2 != null && firstItem.P1 != null) { - Type = "l", - LastPoint = new Point(items.First().P1), - P1 = new Point(items.Last().LastPoint) - }; - items.Add(line); + items.Add(new Item + { + Type = "l", + P1 = lastItem.P2, + P2 = firstItem.P1 + }); + } } - foreach (Item item in items) + foreach (var item in items) { - if (item.Type != "l" && item.Type != "re" && item.Type != "qu") // ignore anything else - continue; - - if (item.Type == "l") // a line + if (item.Type == "l") // Line { - var p1 = item.P1; - var p2 = item.P2; - var lineDict = make_line(p, p1, p2, clip); - if (lineDict != null) + if (item.P1 != null && item.LastPoint != null) { - EDGES.Add(Global.line_to_edge(lineDict)); + var lineDict = MakeLine(p, item.P1, item.LastPoint, clip); + if (lineDict != null) + { + var edge = EdgeProcessing.LineToEdge(lineDict); + TableGlobals.EDGES.Add(edge); + } } } - else if (item.Type == "re") + else if (item.Type == "re" && item.Rect != null) // Rectangle { - // A rectangle: decompose into 4 lines - Rect rect = item.Rect; // Normalize the rectangle + var rect = item.Rect; rect.Normalize(); - // If it simulates a vertical line - if (rect.Width <= min_length && rect.Width < rect.Height) + // Check if simulates a vertical line + if (rect.Width <= minLength && rect.Width < rect.Height) { - float x = (rect.X1 + rect.X0) / 2; - Point p1 = new Point(x, rect.Y0); - Point p2 = new Point(x, rect.Y1); - var lineDict = make_line(p, p1, p2, clip); + float x = Math.Abs(rect.X1 + rect.X0) / 2; + var p1 = new Point(x, rect.Y0); + var p2 = new Point(x, rect.Y1); + var lineDict = MakeLine(p, p1, p2, clip); if (lineDict != null) { - EDGES.Add(line_to_edge(lineDict)); + var edge = EdgeProcessing.LineToEdge(lineDict); + TableGlobals.EDGES.Add(edge); } continue; } - // If it simulates a horizontal line - if (rect.Height <= min_length && rect.Height < rect.Width) + // Check if simulates a horizontal line + if (rect.Height <= minLength && rect.Height < rect.Width) { - float y = (rect.Y1 + rect.Y0) / 2; + float y = Math.Abs(rect.Y1 + rect.Y0) / 2; var p1 = new Point(rect.X0, y); var p2 = new Point(rect.X1, y); - var lineDict = make_line(p, p1, p2, clip); + var lineDict = MakeLine(p, p1, p2, clip); if (lineDict != null) { - EDGES.Add(line_to_edge(lineDict)); + var edge = EdgeProcessing.LineToEdge(lineDict); + TableGlobals.EDGES.Add(edge); } continue; } - var line_dict = make_line(p, rect.TopLeft, rect.BottomLeft, clip); - if (line_dict != null) - EDGES.Add(line_to_edge(line_dict)); - line_dict = make_line(p, rect.BottomLeft, rect.BottomRight, clip); - if (line_dict != null) - EDGES.Add(line_to_edge(line_dict)); - line_dict = make_line(p, rect.BottomRight, rect.TopRight, clip); - if 
(line_dict != null) - EDGES.Add(line_to_edge(line_dict)); - line_dict = make_line(p, rect.TopRight, rect.TopLeft, clip); - if (line_dict != null) - EDGES.Add(line_to_edge(line_dict)); + // Decompose rectangle into 4 lines + var tl = new Point(rect.X0, rect.Y0); + var tr = new Point(rect.X1, rect.Y0); + var bl = new Point(rect.X0, rect.Y1); + var br = new Point(rect.X1, rect.Y1); + + var lineDict1 = MakeLine(p, tl, bl, clip); + if (lineDict1 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict1)); + + var lineDict2 = MakeLine(p, bl, br, clip); + if (lineDict2 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict2)); + + var lineDict3 = MakeLine(p, br, tr, clip); + if (lineDict3 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict3)); + + var lineDict4 = MakeLine(p, tr, tl, clip); + if (lineDict4 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict4)); } - else // must be a quad (quads have 4 points) + else if (item.Type == "qu" && item.Quad != null) // Quad { - Point ul = item.Quad.UpperLeft; - Point ur = item.Quad.UpperRight; - Point ll = item.Quad.LowerLeft; - Point lr = item.Quad.LowerRight; + var quad = item.Quad; + var ul = quad.UpperLeft; + var ur = quad.UpperRight; + var ll = quad.LowerLeft; + var lr = quad.LowerRight; - var lineDict = make_line(p, ul, ll, clip); - if (lineDict != null) - { - EDGES.Add(line_to_edge(lineDict)); - } + var lineDict1 = MakeLine(p, ul, ll, clip); + if (lineDict1 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict1)); - lineDict = make_line(p, ll, lr, clip); - if (lineDict != null) - { - EDGES.Add(line_to_edge(lineDict)); - } + var lineDict2 = MakeLine(p, ll, lr, clip); + if (lineDict2 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict2)); - lineDict = make_line(p, lr, ur, clip); - if (lineDict != null) - { - EDGES.Add(line_to_edge(lineDict)); - } + var lineDict3 = MakeLine(p, lr, ur, clip); + if (lineDict3 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict3)); - lineDict = make_line(p, ur, ul, clip); - if (lineDict != null) - { - EDGES.Add(line_to_edge(lineDict)); - } + var lineDict4 = MakeLine(p, ur, ul, clip); + if (lineDict4 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict4)); } } } - // Define the path with color, fill, and width - PathInfo path = new PathInfo(); - path.Color = new float[] { 0f, 0f, 0f }; - path.Fill = null; - path.Width = 1f; - - foreach (Rect bbox in bboxes) + // Add border lines for all enveloping bboxes + var defaultPath = new PathInfo { Color = new float[] { 0, 0, 0 }, Fill = null, Width = 1 }; + foreach (var bbox in bboxes) { - var lineDict = make_line(path, bbox.TopLeft, bbox.TopRight, clip); - if (lineDict != null) - EDGES.Add(line_to_edge(lineDict)); + var tl = new Point(bbox.X0, bbox.Y0); + var tr = new Point(bbox.X1, bbox.Y0); + var bl = new Point(bbox.X0, bbox.Y1); + var br = new Point(bbox.X1, bbox.Y1); - lineDict = make_line(path, bbox.BottomLeft, bbox.BottomRight, clip); - if (lineDict != null) - EDGES.Add(line_to_edge(lineDict)); + var lineDict1 = MakeLine(defaultPath, tl, tr, clip); + if (lineDict1 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict1)); - lineDict = make_line(path, bbox.TopLeft, bbox.BottomLeft, clip); - if (lineDict != null) - EDGES.Add(line_to_edge(lineDict)); - - lineDict = make_line(path, bbox.TopRight, bbox.BottomRight, clip); - if (lineDict != null) - EDGES.Add(line_to_edge(lineDict)); - } + var lineDict2 = MakeLine(defaultPath, bl, br, clip); + if (lineDict2 != 
null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict2)); - return; - } + var lineDict3 = MakeLine(defaultPath, tl, bl, clip); + if (lineDict3 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict3)); - public static List
FindTables( - Page paramPage, - Rect clip, - TableSettings tset - ) - { - Page page = new Page(paramPage.GetPdfPage(), paramPage.Parent); + var lineDict4 = MakeLine(defaultPath, tr, br, clip); + if (lineDict4 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict4)); + } - if (page.Rotation != 0) + // Add user-specified lines + if (addLines != null) { - page = page_rotation_set0(page); + foreach (var (p1, p2) in addLines) + { + var lineDict = MakeLine(defaultPath, p1, p2, clip); + if (lineDict != null) + TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict)); + } } - TableFinder tableFinder = new TableFinder(paramPage, clip, tset); + // Add user-specified boxes + if (addBoxes != null) + { + foreach (var box in addBoxes) + { + var tl = new Point(box.X0, box.Y0); + var tr = new Point(box.X1, box.Y0); + var bl = new Point(box.X0, box.Y1); + var br = new Point(box.X1, box.Y1); + + var lineDict1 = MakeLine(defaultPath, tl, bl, clip); + if (lineDict1 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict1)); - return tableFinder.tables; + var lineDict2 = MakeLine(defaultPath, bl, br, clip); + if (lineDict2 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict2)); + + var lineDict3 = MakeLine(defaultPath, br, tr, clip); + if (lineDict3 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict3)); + + var lineDict4 = MakeLine(defaultPath, tr, tl, clip); + if (lineDict4 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict4)); + } + } } } } diff --git a/MuPDF.NET/TextPage.cs b/MuPDF.NET/TextPage.cs index b603815..aad7d6b 100644 --- a/MuPDF.NET/TextPage.cs +++ b/MuPDF.NET/TextPage.cs @@ -22,28 +22,40 @@ static TextPage() /// /// Rect of Stext Page /// + private FzRect _mediaBox = null; private FzRect MediaBox { - get { return new FzRect(_nativeTextPage.m_internal.mediabox); } + get { + if (_mediaBox == null) + { + _mediaBox = new FzRect(_nativeTextPage.m_internal.mediabox); + } + return _mediaBox; + } } /// /// Block List of Text /// + private List _blocks = null; public List Blocks { get { - List blocks = new List(); - for ( - fz_stext_block block = _nativeTextPage.m_internal.first_block; - block != null; - block = block.next - ) + if (_blocks == null) { - blocks.Add(new FzStextBlock(block)); + List blocks = new List(); + for ( + fz_stext_block block = _nativeTextPage.m_internal.first_block; + block != null; + block = block.next + ) + { + blocks.Add(new FzStextBlock(block)); + } + _blocks = blocks; } - return blocks; + return _blocks; } } @@ -1138,7 +1150,7 @@ internal void MakeTextPage2Dict(PageInfo pageDict, bool raw) blockDict.Size = mupdf.mupdf.fz_image_size(image); blockDict.Image = Utils.BinFromBuffer(buf); } - else + else if (block.m_internal.type == (int)STextBlockType.FZ_STEXT_BLOCK_TEXT) { List lineList = new List(); diff --git a/MuPDF.NET/Utils.cs b/MuPDF.NET/Utils.cs index 2161bf8..ff39097 100644 --- a/MuPDF.NET/Utils.cs +++ b/MuPDF.NET/Utils.cs @@ -1,4 +1,4 @@ -using System; +using System; using System.Collections.Generic; using System.Data; using System.Diagnostics; @@ -19,7 +19,6 @@ using Newtonsoft.Json; using SkiaSharp; using static System.Net.Mime.MediaTypeNames; -using static MuPDF.NET.Global; namespace MuPDF.NET { @@ -1623,40 +1622,47 @@ public static List
GetTables( List add_lines = null ) { - if (strategy != null) - { - vertical_strategy = strategy; - horizontal_strategy = strategy; - } - - Dictionary settings = new Dictionary - { - { "vertical_strategy", vertical_strategy }, - { "horizontal_strategy", horizontal_strategy }, - { "explicit_vertical_lines", vertical_lines }, - { "explicit_horizontal_lines", horizontal_lines }, - { "snap_tolerance", snap_tolerance }, - { "snap_x_tolerance", snap_x_tolerance }, - { "snap_y_tolerance", snap_y_tolerance }, - { "join_tolerance", join_tolerance }, - { "join_x_tolerance", join_x_tolerance }, - { "join_y_tolerance", join_y_tolerance }, - { "edge_min_length", edge_min_length }, - { "min_words_vertical", min_words_vertical }, - { "min_words_horizontal", min_words_horizontal }, - { "intersection_tolerance", intersection_tolerance }, - { "intersection_x_tolerance", intersection_x_tolerance }, - { "intersection_y_tolerance", intersection_y_tolerance }, - { "text_tolerance", text_tolerance }, - { "text_x_tolerance", text_x_tolerance }, - { "text_y_tolerance", text_y_tolerance } - }; + // Convert List to List for FindTables + List verticalLinesObj = vertical_lines?.Cast().ToList(); + List horizontalLinesObj = horizontal_lines?.Cast().ToList(); + + // Note: add_lines parameter in GetTables is List (text lines), but FindTables expects + // List> (geometric line segments). Since Line is a text line structure + // and not a geometric line, we cannot properly convert it. Pass null for now. + // If geometric line segments are needed, they should be passed directly to FindTables. + List> addLinesTuple = null; + + // Call FindTables with nullable tolerances (0.0f means use default, null means UNSET) + var finder = TableFinderHelper.FindTables( + page: page, + clip: clip, + vertical_strategy: vertical_strategy, + horizontal_strategy: horizontal_strategy, + vertical_lines: verticalLinesObj, + horizontal_lines: horizontalLinesObj, + snap_tolerance: snap_tolerance, + snap_x_tolerance: snap_x_tolerance == 0.0f ? (float?)null : snap_x_tolerance, + snap_y_tolerance: snap_y_tolerance == 0.0f ? (float?)null : snap_y_tolerance, + join_tolerance: join_tolerance, + join_x_tolerance: join_x_tolerance == 0.0f ? (float?)null : join_x_tolerance, + join_y_tolerance: join_y_tolerance == 0.0f ? (float?)null : join_y_tolerance, + edge_min_length: edge_min_length, + min_words_vertical: min_words_vertical, + min_words_horizontal: min_words_horizontal, + intersection_tolerance: intersection_tolerance, + intersection_x_tolerance: intersection_x_tolerance == 0.0f ? (float?)null : intersection_x_tolerance, + intersection_y_tolerance: intersection_y_tolerance == 0.0f ? (float?)null : intersection_y_tolerance, + text_tolerance: text_tolerance, + text_x_tolerance: text_x_tolerance, + text_y_tolerance: text_y_tolerance, + strategy: strategy, + add_lines: addLinesTuple + ); - // Resolve settings - TableSettings tset = TableSettings.resolve(settings); + if (finder == null) + return new List
(); - List
tables = TableFinder.FindTables(page, clip, tset); - return tables; + return finder.tables; } /// diff --git a/MuPDF.NET4LLM.Test/IdentifyHeadersTest.cs b/MuPDF.NET4LLM.Test/IdentifyHeadersTest.cs new file mode 100644 index 0000000..69f8f01 --- /dev/null +++ b/MuPDF.NET4LLM.Test/IdentifyHeadersTest.cs @@ -0,0 +1,90 @@ +using System.Collections.Generic; +using MuPDF.NET; +using MuPDF.NET4LLM.Helpers; + +namespace MuPDF.NET4LLM.Test +{ + [TestFixture] + public class IdentifyHeadersTest : LLMTestBase + { + [Test] + public void Constructor_WithValidDocument_CreatesInstance() + { + var doc = OpenTestDocument("Magazine.pdf"); + try + { + var identifyHeaders = new IdentifyHeaders(doc); + Assert.That(identifyHeaders, Is.Not.Null); + } + finally + { + doc.Close(); + } + } + + [Test] + public void Constructor_WithFilePath_CreatesInstance() + { + string filePath = GetResourcePath("Magazine.pdf"); + var identifyHeaders = new IdentifyHeaders(filePath); + Assert.That(identifyHeaders, Is.Not.Null); + } + + [Test] + public void Constructor_WithMaxLevelsOutOfRange_ThrowsException() + { + string filePath = GetResourcePath("Magazine.pdf"); + + Assert.Throws(() => + { + new IdentifyHeaders(filePath, maxLevels: 0); + }); + + Assert.Throws(() => + { + new IdentifyHeaders(filePath, maxLevels: 7); + }); + } + + [Test] + public void Constructor_WithSpecificPages_Works() + { + var doc = OpenTestDocument("Magazine.pdf"); + try + { + var identifyHeaders = new IdentifyHeaders(doc, pages: new List { 0 }); + Assert.That(identifyHeaders, Is.Not.Null); + } + finally + { + doc.Close(); + } + } + + [Test] + public void GetHeaderId_WithSmallFont_ReturnsEmpty() + { + var doc = OpenTestDocument("Magazine.pdf"); + try + { + var identifyHeaders = new IdentifyHeaders(doc); + var page = doc[0]; + + // Create a mock span with small font size + var span = new ExtendedSpan + { + Size = 10.0f, + Text = "Test" + }; + + string headerId = identifyHeaders.GetHeaderId(span, page); + // Should return empty for body text + Assert.That(headerId, Is.Not.Null); + } + finally + { + doc.Close(); + } + } + } +} diff --git a/MuPDF.NET4LLM.Test/LLMTestBase.cs b/MuPDF.NET4LLM.Test/LLMTestBase.cs new file mode 100644 index 0000000..ed00ea0 --- /dev/null +++ b/MuPDF.NET4LLM.Test/LLMTestBase.cs @@ -0,0 +1,27 @@ +using System.IO; +using MuPDF.NET; + +namespace MuPDF.NET4LLM.Test +{ + /// + /// Base class for MuPDF.NET4LLM tests + /// + public class LLMTestBase + { + protected string GetResourcePath(string relativePath) + { + // Get the test project directory + string testDir = Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location); + string projectDir = Path.GetFullPath(Path.Combine(testDir, "..", "..", "..")); + return Path.Combine(projectDir, "resources", relativePath); + } + + protected Document OpenTestDocument(string relativePath) + { + string fullPath = GetResourcePath(relativePath); + if (!File.Exists(fullPath)) + throw new FileNotFoundException($"Test resource not found: {fullPath}"); + return new Document(fullPath); + } + } +} diff --git a/MuPDF.NET4LLM.Test/MuPDF.NET4LLM.Test.csproj b/MuPDF.NET4LLM.Test/MuPDF.NET4LLM.Test.csproj new file mode 100644 index 0000000..fbc8f3e --- /dev/null +++ b/MuPDF.NET4LLM.Test/MuPDF.NET4LLM.Test.csproj @@ -0,0 +1,35 @@ + + + + net8.0 + enable + enable + + false + true + + + + + + + + + + + + + + + + + + + + + + PreserveNewest + + + + diff --git a/MuPDF.NET4LLM.Test/MuPDF4LLMTest.cs b/MuPDF.NET4LLM.Test/MuPDF4LLMTest.cs new file mode 100644 index 0000000..bd7c892 --- /dev/null +++ 
b/MuPDF.NET4LLM.Test/MuPDF4LLMTest.cs @@ -0,0 +1,199 @@ +using System; +using System.Collections.Generic; +using MuPDF.NET; +using MuPDF.NET4LLM; +using MuPDF.NET4LLM.Llama; + +namespace MuPDF.NET4LLM.Test +{ + [TestFixture] + public class MuPDF4LLMTest : LLMTestBase + { + [Test] + public void Version_ReturnsValidVersion() + { + string version = MuPDF4LLM.Version; + Assert.That(version, Is.Not.Null); + Assert.That(version, Is.Not.Empty); + Assert.That(version.Split('.').Length, Is.GreaterThanOrEqualTo(2)); + } + + [Test] + public void VersionTuple_ReturnsValidTuple() + { + var (major, minor, patch) = MuPDF4LLM.VersionTuple; + Assert.That(major, Is.GreaterThanOrEqualTo(0)); + Assert.That(minor, Is.GreaterThanOrEqualTo(0)); + Assert.That(patch, Is.GreaterThanOrEqualTo(0)); + } + + [Test] + public void LlamaMarkdownReader_ReturnsReader() + { + var reader = MuPDF4LLM.LlamaMarkdownReader(); + Assert.That(reader, Is.Not.Null); + Assert.That(reader, Is.InstanceOf()); + } + + [Test] + public void LlamaMarkdownReader_WithMetaFilter_ReturnsReader() + { + Func, Dictionary> filter = + (meta) => { meta["custom"] = "value"; return meta; }; + + var reader = MuPDF4LLM.LlamaMarkdownReader(filter); + Assert.That(reader, Is.Not.Null); + Assert.That(reader.MetaFilter, Is.EqualTo(filter)); + } + + [Test] + public void ToMarkdown_WithValidDocument_ReturnsMarkdown() + { + var doc = OpenTestDocument("Magazine.pdf"); + try + { + string markdown = MuPDF4LLM.ToMarkdown( + doc, + header: false, + footer: false, + showProgress: false, + useOcr: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithSpecificPages_ReturnsMarkdown() + { + var doc = OpenTestDocument("Magazine.pdf"); + try + { + string markdown = MuPDF4LLM.ToMarkdown( + doc, + pages: new List { 0 }, + header: false, + footer: false, + showProgress: false, + useOcr: false + ); + + Assert.That(markdown, Is.Not.Null); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WriteImagesAndEmbedImages_ThrowsException() + { + var doc = OpenTestDocument("columns.pdf"); + try + { + Assert.Throws(() => + { + MuPDF4LLM.ToMarkdown( + doc, + writeImages: true, + embedImages: true, + showProgress: false, + useOcr: false + ); + }); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToJson_WithValidDocument_ReturnsJson() + { + var doc = OpenTestDocument("columns.pdf"); + try + { + string json = MuPDF4LLM.ToJson( + doc, + showProgress: false, + useOcr: false + ); + + Assert.That(json, Is.Not.Null); + Assert.That(json, Is.Not.Empty); + Assert.That(json.TrimStart(), Does.StartWith("{")); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToText_WithValidDocument_ReturnsText() + { + var doc = OpenTestDocument("columns.pdf"); + try + { + string text = MuPDF4LLM.ToText( + doc, + header: false, + footer: false, + showProgress: false, + useOcr: false + ); + + Assert.That(text, Is.Not.Null); + Assert.That(text, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ParseDocument_WithValidDocument_ReturnsParsedDocument() + { + var doc = OpenTestDocument("columns.pdf"); + try + { + var parsedDoc = MuPDF4LLM.ParseDocument( + doc, + showProgress: false, + useOcr: false + ); + + Assert.That(parsedDoc, Is.Not.Null); + } + finally + { + doc.Close(); + } + } + + [Test] + public void GetKeyValues_WithNonFormPDF_ReturnsEmptyDictionary() + { + var doc = OpenTestDocument("Magazine.pdf"); + try + { + var 
keyValues = MuPDF4LLM.GetKeyValues(doc); + Assert.That(keyValues, Is.Not.Null); + Assert.That(keyValues, Is.Empty); + } + finally + { + doc.Close(); + } + } + } +} diff --git a/MuPDF.NET4LLM.Test/MuPdfRagTest.cs b/MuPDF.NET4LLM.Test/MuPdfRagTest.cs new file mode 100644 index 0000000..cc8fa71 --- /dev/null +++ b/MuPDF.NET4LLM.Test/MuPdfRagTest.cs @@ -0,0 +1,361 @@ +using System; +using System.Collections.Generic; +using MuPDF.NET; +using MuPDF.NET4LLM.Helpers; +using NUnit.Framework; + +namespace MuPDF.NET4LLM.Test +{ + [TestFixture] + public class MuPdfRagTest : LLMTestBase + { + [Test] + public void ToMarkdown_BasicWithDefaultSettings_ReturnsMarkdown() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: null, // All pages + hdrInfo: null, // Auto-detect headers + writeImages: false, + embedImages: false, + ignoreImages: false, + ignoreGraphics: false, + detectBgColor: true, + imagePath: "", + imageFormat: "png", + imageSizeLimit: 0.05f, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + pageChunks: false, + pageSeparators: false, + margins: null, + dpi: 150, + pageWidth: 612, + pageHeight: null, + tableStrategy: "lines_strict", + graphicsLimit: null, + fontsizeLimit: 3.0f, + ignoreCode: false, + extractWords: false, + showProgress: false, + useGlyphs: false, + ignoreAlpha: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + Assert.That(markdown.Length, Is.GreaterThan(0)); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithIdentifyHeaders_ReturnsMarkdown() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + var identifyHeaders = new IdentifyHeaders(doc, pages: null, bodyLimit: 12.0f, maxLevels: 6); + + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, // First page only + hdrInfo: identifyHeaders, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + showProgress: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithTocHeaders_ReturnsMarkdown() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + var tocHeaders = new TocHeaders(doc); + + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, // First page only + hdrInfo: tocHeaders, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + showProgress: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithPageSeparators_ReturnsMarkdown() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: null, // All pages + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + pageSeparators: true, // Add page separators + showProgress: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + + // Verify page separators are present + Assert.That(markdown, Does.Contain("--- end of page=")); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithSpecificPages_ReturnsMarkdown() + { + 
var doc = OpenTestDocument("national-capitals.pdf"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0, 1 }, // First two pages + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + showProgress: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithInvalidImageSizeLimit_ThrowsException() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + Assert.Throws(() => + { + MuPdfRag.ToMarkdown( + doc, + imageSizeLimit: 1.5f, // Invalid: >= 1 + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + showProgress: false + ); + }); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithMargins_ReturnsMarkdown() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + margins: new List { 10.0f, 20.0f, 10.0f, 20.0f }, // left, top, right, bottom + showProgress: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithInvalidMargins_ThrowsException() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + Assert.Throws(() => + { + MuPdfRag.ToMarkdown( + doc, + margins: new List { 10.0f, 20.0f, 30.0f }, // Invalid: 3 elements + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + showProgress: false + ); + }); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithTableStrategy_ReturnsMarkdown() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + tableStrategy: "lines", + showProgress: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithIgnoreImages_ReturnsMarkdown() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: true, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + showProgress: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithIgnoreGraphics_ReturnsMarkdown() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + ignoreGraphics: true, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + showProgress: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithPageChunks_ReturnsJson() + { + var doc = 
OpenTestDocument("national-capitals.pdf"); + try + { + string result = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + pageChunks: true, + showProgress: false + ); + + Assert.That(result, Is.Not.Null); + Assert.That(result, Is.Not.Empty); + // In page_chunks mode, result should be JSON or structured text + Assert.That(result.Length, Is.GreaterThan(0)); + } + finally + { + doc.Close(); + } + } + } +} diff --git a/MuPDF.NET4LLM.Test/PDFMarkdownReaderTest.cs b/MuPDF.NET4LLM.Test/PDFMarkdownReaderTest.cs new file mode 100644 index 0000000..4560083 --- /dev/null +++ b/MuPDF.NET4LLM.Test/PDFMarkdownReaderTest.cs @@ -0,0 +1,163 @@ +using System; +using System.Collections.Generic; +using System.IO; +using MuPDF.NET; +using MuPDF.NET4LLM.Llama; + +namespace MuPDF.NET4LLM.Test +{ + [TestFixture] + public class PDFMarkdownReaderTest : LLMTestBase + { + [Test] + public void Constructor_WithoutMetaFilter_CreatesReader() + { + var reader = new PDFMarkdownReader(); + Assert.That(reader, Is.Not.Null); + Assert.That(reader.MetaFilter, Is.Null); + } + + [Test] + public void Constructor_WithMetaFilter_CreatesReader() + { + Func, Dictionary> filter = + (meta) => meta; + + var reader = new PDFMarkdownReader(filter); + Assert.That(reader, Is.Not.Null); + Assert.That(reader.MetaFilter, Is.EqualTo(filter)); + } + + [Test] + public void LoadData_WithNullFilePath_ThrowsArgumentNullException() + { + var reader = new PDFMarkdownReader(); + Assert.Throws(() => + { + reader.LoadData(null); + }); + } + + [Test] + public void LoadData_WithNonExistentFile_ThrowsFileNotFoundException() + { + var reader = new PDFMarkdownReader(); + Assert.Throws(() => + { + reader.LoadData("nonexistent.pdf"); + }); + } + + [Test] + public void LoadData_WithValidFile_ReturnsDocuments() + { + var reader = new PDFMarkdownReader(); + string filePath = GetResourcePath("columns.pdf"); + + var docs = reader.LoadData(filePath); + + Assert.That(docs, Is.Not.Null); + Assert.That(docs.Count, Is.GreaterThan(0)); + } + + [Test] + public void LoadData_WithValidFile_ReturnsDocumentsWithText() + { + var reader = new PDFMarkdownReader(); + string filePath = GetResourcePath("columns.pdf"); + + var docs = reader.LoadData(filePath); + + Assert.That(docs.Count, Is.GreaterThan(0)); + foreach (var doc in docs) + { + Assert.That(doc, Is.Not.Null); + Assert.That(doc.Text, Is.Not.Null); + Assert.That(doc.ExtraInfo, Is.Not.Null); + } + } + + [Test] + public void LoadData_WithExtraInfo_IncludesExtraInfo() + { + var reader = new PDFMarkdownReader(); + string filePath = GetResourcePath("columns.pdf"); + var extraInfo = new Dictionary + { + { "custom_key", "custom_value" } + }; + + var docs = reader.LoadData(filePath, extraInfo: extraInfo); + + Assert.That(docs.Count, Is.GreaterThan(0)); + Assert.That(docs[0].ExtraInfo.ContainsKey("custom_key"), Is.True); + Assert.That(docs[0].ExtraInfo["custom_key"], Is.EqualTo("custom_value")); + } + + [Test] + public void LoadData_WithMetaFilter_AppliesFilter() + { + bool filterCalled = false; + Func, Dictionary> filter = + (meta) => + { + filterCalled = true; + meta["filtered"] = true; + return meta; + }; + + var reader = new PDFMarkdownReader(filter); + string filePath = GetResourcePath("columns.pdf"); + + var docs = reader.LoadData(filePath); + + Assert.That(filterCalled, Is.True); + Assert.That(docs.Count, Is.GreaterThan(0)); + 
Assert.That(docs[0].ExtraInfo.ContainsKey("filtered"), Is.True); + Assert.That(docs[0].ExtraInfo["filtered"], Is.EqualTo(true)); + } + + [Test] + public void LoadData_WithLoadKwargs_RespectsKwargs() + { + var reader = new PDFMarkdownReader(); + string filePath = GetResourcePath("columns.pdf"); + var loadKwargs = new Dictionary + { + { "force_text", true }, + { "write_images", false }, + { "embed_images", false } + }; + + var docs = reader.LoadData(filePath, loadKwargs: loadKwargs); + + Assert.That(docs, Is.Not.Null); + Assert.That(docs.Count, Is.GreaterThan(0)); + } + + [Test] + public void LoadData_WithStringPath_Works() + { + var reader = new PDFMarkdownReader(); + string filePath = GetResourcePath("columns.pdf"); + + var docs = reader.LoadData(filePath); + + Assert.That(docs, Is.Not.Null); + } + + [Test] + public void LoadData_IncludesPageMetadata() + { + var reader = new PDFMarkdownReader(); + string filePath = GetResourcePath("columns.pdf"); + + var docs = reader.LoadData(filePath); + + Assert.That(docs.Count, Is.GreaterThan(0)); + Assert.That(docs[0].ExtraInfo.ContainsKey("page"), Is.True); + Assert.That(docs[0].ExtraInfo.ContainsKey("total_pages"), Is.True); + Assert.That(docs[0].ExtraInfo.ContainsKey("file_path"), Is.True); + } + } +} diff --git a/MuPDF.NET4LLM.Test/README.md b/MuPDF.NET4LLM.Test/README.md new file mode 100644 index 0000000..0a24c2a --- /dev/null +++ b/MuPDF.NET4LLM.Test/README.md @@ -0,0 +1,52 @@ +# MuPDF.NET4LLM.Test + +Unit tests for the MuPDF.NET4LLM project. + +## Test Structure + +The test project follows the same structure as `MuPDF.NET.Test` and uses NUnit as the testing framework. + +## Test Classes + +- **MuPDF4LLMTest**: Tests for the main `MuPDF4LLM` static class + - Version information + - Document conversion methods (ToMarkdown, ToJson, ToText) + - LlamaIndex reader creation + - Error handling + +- **PDFMarkdownReaderTest**: Tests for the `PDFMarkdownReader` class + - Constructor tests + - LoadData method with various parameters + - MetaFilter functionality + - Error handling + +- **UtilsTest**: Tests for utility functions + - White character detection + - Bullet character detection + - Constants validation + +- **IdentifyHeadersTest**: Tests for header identification + - Constructor with various parameters + - Header ID generation + - Error handling + +- **VersionInfoTest**: Tests for version information + - Version string validation + - Minimum MuPDF version validation + +## Running Tests + +Tests can be run using: +- Visual Studio Test Explorer +- `dotnet test` command +- NUnit Test Adapter + +## Test Resources + +Test resources (PDF files) should be placed in the `resources` directory. The `columns.pdf` file is used as a sample test document. 
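+
+## Example Test
+
+A minimal sketch of a test built on the `LLMTestBase` helpers, following the same pattern as the test classes above. The class name is illustrative; the resource file and the `MuPdfRag.ToMarkdown` parameters mirror those used in `MuPdfRagTest`:
+
+```csharp
+using System.Collections.Generic;
+using MuPDF.NET;
+using MuPDF.NET4LLM.Helpers;
+using NUnit.Framework;
+
+namespace MuPDF.NET4LLM.Test
+{
+    [TestFixture]
+    public class ExampleMarkdownTest : LLMTestBase
+    {
+        [Test]
+        public void ToMarkdown_FirstPage_ReturnsNonEmptyMarkdown()
+        {
+            // Arrange: open a PDF from the resources directory.
+            var doc = OpenTestDocument("national-capitals.pdf");
+            try
+            {
+                // Act: convert the first page to Markdown without writing or embedding images.
+                string markdown = MuPdfRag.ToMarkdown(
+                    doc,
+                    pages: new List<int> { 0 },
+                    writeImages: false,
+                    embedImages: false,
+                    filename: GetResourcePath("national-capitals.pdf"),
+                    forceText: true,
+                    showProgress: false);
+
+                // Assert: some Markdown text was produced.
+                Assert.That(markdown, Is.Not.Null);
+                Assert.That(markdown, Is.Not.Empty);
+            }
+            finally
+            {
+                doc.Close();
+            }
+        }
+    }
+}
+```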
+ +## Notes + +- Tests that require OCR are disabled by default (`useOcr: false`) to avoid dependencies on OCR libraries +- Some tests may require specific PDF files in the resources directory +- Tests follow the Arrange-Act-Assert pattern diff --git a/MuPDF.NET4LLM.Test/UtilsTest.cs b/MuPDF.NET4LLM.Test/UtilsTest.cs new file mode 100644 index 0000000..7a93e4a --- /dev/null +++ b/MuPDF.NET4LLM.Test/UtilsTest.cs @@ -0,0 +1,57 @@ +using System; +using MuPDF4LLMUtils = MuPDF.NET4LLM.Helpers.Utils; + +namespace MuPDF.NET4LLM.Test +{ + [TestFixture] + public class UtilsTest + { + [Test] + public void WhiteChars_ContainsExpectedCharacters() + { + Assert.That(MuPDF4LLMUtils.WHITE_CHARS.Contains(' '), Is.True); + Assert.That(MuPDF4LLMUtils.WHITE_CHARS.Contains('\t'), Is.True); + Assert.That(MuPDF4LLMUtils.WHITE_CHARS.Contains('\n'), Is.True); + Assert.That(MuPDF4LLMUtils.WHITE_CHARS.Contains('\u00a0'), Is.True); // Non-breaking space + Assert.That(MuPDF4LLMUtils.WHITE_CHARS.Contains('a'), Is.False); + } + + [Test] + public void IsWhite_WithWhiteString_ReturnsTrue() + { + Assert.That(MuPDF4LLMUtils.IsWhite(" "), Is.True); + Assert.That(MuPDF4LLMUtils.IsWhite("\t\n"), Is.True); + Assert.That(MuPDF4LLMUtils.IsWhite("\u00a0"), Is.True); // Non-breaking space + Assert.That(MuPDF4LLMUtils.IsWhite(""), Is.True); + } + + [Test] + public void IsWhite_WithNonWhiteString_ReturnsFalse() + { + Assert.That(MuPDF4LLMUtils.IsWhite("hello"), Is.False); + Assert.That(MuPDF4LLMUtils.IsWhite(" hello "), Is.False); + Assert.That(MuPDF4LLMUtils.IsWhite("a"), Is.False); + } + + [Test] + public void Bullets_ContainsExpectedCharacters() + { + Assert.That(MuPDF4LLMUtils.BULLETS.Contains('*'), Is.True); + Assert.That(MuPDF4LLMUtils.BULLETS.Contains('-'), Is.True); + Assert.That(MuPDF4LLMUtils.BULLETS.Contains('>'), Is.True); + Assert.That(MuPDF4LLMUtils.BULLETS.Contains('o'), Is.True); + } + + [Test] + public void ReplacementCharacter_IsCorrect() + { + Assert.That(MuPDF4LLMUtils.REPLACEMENT_CHARACTER, Is.EqualTo('\uFFFD')); + } + + [Test] + public void Type3FontName_IsCorrect() + { + Assert.That(MuPDF4LLMUtils.TYPE3_FONT_NAME, Is.EqualTo("Unnamed-T3")); + } + } +} diff --git a/MuPDF.NET4LLM.Test/VersionInfoTest.cs b/MuPDF.NET4LLM.Test/VersionInfoTest.cs new file mode 100644 index 0000000..a70b859 --- /dev/null +++ b/MuPDF.NET4LLM.Test/VersionInfoTest.cs @@ -0,0 +1,24 @@ +using MuPDF.NET4LLM; + +namespace MuPDF.NET4LLM.Test +{ + [TestFixture] + public class VersionInfoTest + { + [Test] + public void Version_IsNotNull() + { + Assert.That(VersionInfo.Version, Is.Not.Null); + Assert.That(VersionInfo.Version, Is.Not.Empty); + } + + [Test] + public void MinimumMuPDFVersion_IsValid() + { + var (major, minor, patch) = VersionInfo.MinimumMuPDFVersion; + Assert.That(major, Is.GreaterThanOrEqualTo(1)); + Assert.That(minor, Is.GreaterThanOrEqualTo(0)); + Assert.That(patch, Is.GreaterThanOrEqualTo(0)); + } + } +} diff --git a/MuPDF.NET4LLM.Test/resources/Magazine.pdf b/MuPDF.NET4LLM.Test/resources/Magazine.pdf new file mode 100644 index 0000000..c8e166e Binary files /dev/null and b/MuPDF.NET4LLM.Test/resources/Magazine.pdf differ diff --git a/MuPDF.NET4LLM.Test/resources/columns.pdf b/MuPDF.NET4LLM.Test/resources/columns.pdf new file mode 100644 index 0000000..18f5f15 Binary files /dev/null and b/MuPDF.NET4LLM.Test/resources/columns.pdf differ diff --git a/MuPDF.NET4LLM.Test/resources/national-capitals.pdf b/MuPDF.NET4LLM.Test/resources/national-capitals.pdf new file mode 100644 index 0000000..d2b4721 Binary files /dev/null and 
b/MuPDF.NET4LLM.Test/resources/national-capitals.pdf differ diff --git a/MuPDF.NET4LLM/MuPDF.NET4LLM.csproj b/MuPDF.NET4LLM/MuPDF.NET4LLM.csproj new file mode 100644 index 0000000..7a76650 --- /dev/null +++ b/MuPDF.NET4LLM/MuPDF.NET4LLM.csproj @@ -0,0 +1,27 @@ + + + + netstandard2.0;net461;net472;net48;net5.0;net6.0;net7.0;net8.0 + AnyCPU;x64;x86 + false + $(Platform) + . + + + + WINDOWS + + + + LINUX + + + + + + + + + + + diff --git a/MuPDF.NET4LLM/MuPDF4LLM.cs b/MuPDF.NET4LLM/MuPDF4LLM.cs new file mode 100644 index 0000000..a633504 --- /dev/null +++ b/MuPDF.NET4LLM/MuPDF4LLM.cs @@ -0,0 +1,290 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using MuPDF.NET; +using MuPDF.NET4LLM.Helpers; +using MuPDF.NET4LLM.Llama; + +namespace MuPDF.NET4LLM +{ + /// + /// Main entry point for MuPDF.NET4LLM functionality. + /// Provides a C# facade over the helpers ported from the Python pymupdf4llm package. + /// + public static class MuPDF4LLM + { + public static string Version => VersionInfo.Version; + + public static (int major, int minor, int patch) VersionTuple + { + get + { + string[] parts = Version.Split('.'); + return ( + int.Parse(parts[0]), + parts.Length > 1 ? int.Parse(parts[1]) : 0, + parts.Length > 2 ? int.Parse(parts[2]) : 0 + ); + } + } + + /// + /// Get a LlamaIndex‑compatible PDF reader that uses + /// under the hood to produce Markdown text per page. + /// + public static PDFMarkdownReader LlamaMarkdownReader( + Func, Dictionary> metaFilter = null) + { + return new PDFMarkdownReader(metaFilter); + } + + /// + /// Process the document and return the text of the selected pages. + /// + /// Input to convert. + /// Include page headers in output. + /// Include page footers in output. + /// List of page numbers to consider (0-based). + /// Save images / graphics as files. + /// Embed images in markdown text (base64 encoded). + /// Store images in this folder. + /// Use this image format. Choose a supported one (e.g. "png", "jpg"). + /// Logical filename used in image names and metadata. + /// Output text despite of image background. + /// Whether to segment output by page. + /// Whether to include page separators in output. + /// Desired resolution for generated images. + /// DPI for OCR operations. + /// Assumption if page layout is variable (reflowable documents). + /// Assumption if page layout is variable (reflowable documents). If null, a single tall page is created. + /// Suppress code-like formatting (mono-space fonts). + /// Print progress as each page is processed. + /// If beneficial invoke OCR. + /// Language for OCR. + public static string ToMarkdown( + Document doc, + bool header = true, + bool footer = true, + List pages = null, + bool writeImages = false, + bool embedImages = false, + string imagePath = "", + string imageFormat = "png", + string filename = "", + bool forceText = true, + bool pageChunks = false, + bool pageSeparators = false, + int dpi = 150, + int ocrDpi = 400, + float pageWidth = 612, + float? 
pageHeight = null, + bool ignoreCode = false, + bool showProgress = false, + bool useOcr = true, + string ocrLanguage = "eng") + { + if (writeImages && embedImages) + throw new ArgumentException("Cannot both write_images and embed_images"); + + var parsedDoc = Helpers.DocumentLayout.ParseDocument( + doc, + filename: filename, + imageDpi: dpi, + imageFormat: imageFormat, + imagePath: imagePath, + pages: pages, + ocrDpi: ocrDpi, + writeImages: writeImages, + embedImages: embedImages, + showProgress: showProgress, + forceText: forceText, + useOcr: useOcr, + ocrLanguage: ocrLanguage); + + return parsedDoc.ToMarkdown( + header: header, + footer: footer, + writeImages: writeImages, + embedImages: embedImages, + ignoreCode: ignoreCode, + showProgress: showProgress, + pageSeparators: pageSeparators, + pageChunks: pageChunks); + } + + /// + /// High‑level helper to convert a to a JSON representation + /// of its layout (pages, boxes, metadata). Wraps + /// and + /// . + /// + /// Input to convert. + /// Desired resolution for generated images. + /// Use this image format. + /// Store images in this folder. + /// List of page numbers to consider (0-based). + /// DPI for OCR operations. + /// Save images / graphics as files. + /// Embed images in JSON (base64 encoded). + /// Print progress as each page is processed. + /// Output text despite of image background. + /// If beneficial invoke OCR. + /// Language for OCR. + public static string ToJson( + Document doc, + int imageDpi = 150, + string imageFormat = "png", + string imagePath = "", + List pages = null, + int ocrDpi = 400, + bool writeImages = false, + bool embedImages = false, + bool showProgress = false, + bool forceText = true, + bool useOcr = true, + string ocrLanguage = "eng") + { + var parsedDoc = Helpers.DocumentLayout.ParseDocument( + doc, + filename: doc.Name, + imageDpi: imageDpi, + ocrDpi: ocrDpi, + imageFormat: imageFormat, + imagePath: imagePath, + pages: pages, + showProgress: showProgress, + embedImages: embedImages, + writeImages: writeImages, + forceText: forceText, + useOcr: useOcr, + ocrLanguage: ocrLanguage); + + return parsedDoc.ToJson(); + } + + /// + /// High‑level helper to convert a to plain text, using the + /// same layout analysis as the Markdown conversion but omitting Markdown syntax. + /// Wraps and + /// . + /// + /// Input to convert. + /// Logical filename used in metadata. + /// Include page headers in output. + /// Include page footers in output. + /// List of page numbers to consider (0-based). + /// Suppress code-like formatting. + /// Print progress as each page is processed. + /// Output text despite of image background. + /// DPI for OCR operations. + /// If beneficial invoke OCR. + /// Language for OCR. + /// Table format for text output (e.g. "grid"). + /// Whether to segment output by page. 
+ public static string ToText( + Document doc, + string filename = "", + bool header = true, + bool footer = true, + List pages = null, + bool ignoreCode = false, + bool showProgress = false, + bool forceText = true, + int ocrDpi = 400, + bool useOcr = true, + string ocrLanguage = "eng", + string tableFormat = "grid", + bool pageChunks = false) + { + var parsedDoc = Helpers.DocumentLayout.ParseDocument( + doc, + filename: filename, + pages: pages, + embedImages: false, + writeImages: false, + showProgress: showProgress, + forceText: forceText, + useOcr: useOcr, + ocrLanguage: ocrLanguage); + + return parsedDoc.ToText( + header: header, + footer: footer, + ignoreCode: ignoreCode, + showProgress: showProgress, + tableFormat: tableFormat, + pageChunks: pageChunks); + } + + /// + /// Parse the logical layout of a into a + /// object, exposing pages, layout boxes, + /// tables, images and metadata. This is the C# equivalent of the Python + /// parse_document helper and is the common basis for Markdown / JSON / text output. + /// + /// Input to convert. + /// Logical filename used in metadata. + /// Desired resolution for generated images. + /// Use this image format. + /// Store images in this folder. + /// DPI for OCR operations. + /// List of page numbers to consider (0-based). + /// Save images / graphics as files. + /// Embed images (base64 encoded). + /// Print progress as each page is processed. + /// Output text despite of image background. + /// If beneficial invoke OCR. + /// Language for OCR. + public static Helpers.ParsedDocument ParseDocument( + Document doc, + string filename = "", + int imageDpi = 150, + string imageFormat = "png", + string imagePath = "", + int ocrDpi = 400, + List pages = null, + bool writeImages = false, + bool embedImages = false, + bool showProgress = false, + bool forceText = true, + bool useOcr = true, + string ocrLanguage = "eng") + { + return Helpers.DocumentLayout.ParseDocument( + doc, + filename: filename, + imageDpi: imageDpi, + imageFormat: imageFormat, + imagePath: imagePath, + ocrDpi: ocrDpi, + pages: pages, + writeImages: writeImages, + embedImages: embedImages, + showProgress: showProgress, + forceText: forceText, + useOcr: useOcr, + ocrLanguage: ocrLanguage); + } + + /// + /// Extract key / value information from interactive form fields, including + /// the pages each field appears on, similar to the Python + /// utils.extract_form_fields_with_pages helper. + /// Traverse /AcroForm/Fields hierarchy and return a dict: + /// fully qualified field name -> {"value": ..., "pages": [...]} + /// Optionally, the xref of the field is included. + /// + /// Input . + /// Include the xref of the field. + public static Dictionary> GetKeyValues( + Document doc, + bool xrefs = false) + { + if (doc.IsFormPDF != 0) + { + return Helpers.Utils.ExtractFormFieldsWithPages(doc, xrefs); + } + return new Dictionary>(); + } + } +} diff --git a/MuPDF.NET4LLM/VersionInfo.cs b/MuPDF.NET4LLM/VersionInfo.cs new file mode 100644 index 0000000..fd4eba9 --- /dev/null +++ b/MuPDF.NET4LLM/VersionInfo.cs @@ -0,0 +1,12 @@ +namespace MuPDF.NET4LLM +{ + /// + /// Version information for MuPDF.NET4LLM + /// Generated file - do not edit. 
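+    /// Example: with Version = "0.2.9", MuPDF4LLM.VersionTuple parses it into (0, 2, 9).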
+ /// + public static class VersionInfo + { + public static readonly (int Major, int Minor, int Patch) MinimumMuPDFVersion = (1, 27, 0); + public const string Version = "0.2.9"; + } +} diff --git a/MuPDF.NET4LLM/helpers/CheckOcr.cs b/MuPDF.NET4LLM/helpers/CheckOcr.cs new file mode 100644 index 0000000..465c686 --- /dev/null +++ b/MuPDF.NET4LLM/helpers/CheckOcr.cs @@ -0,0 +1,313 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using MuPDF.NET; +using mupdf; +using Char = MuPDF.NET.Char; + +namespace MuPDF.NET4LLM.Helpers +{ + /// + /// OCR decision and repair utilities. + /// Ported and adapted from the Python module helpers/check_ocr.py in pymupdf4llm. + /// + public static class CheckOcr + { + public static int FLAGS = (int)( + mupdf.mupdf.FZ_STEXT_COLLECT_STYLES | + mupdf.mupdf.FZ_STEXT_COLLECT_VECTORS | + (int)TextFlags.TEXT_PRESERVE_IMAGES | + (int)TextFlags.TEXT_ACCURATE_BBOXES + // | mupdf.mupdf.FZ_STEXT_MEDIABOX_CLIP + ); + + /// + /// Return OCR'd span text using Tesseract. + /// + /// MuPDF Page + /// MuPDF Rect or its sequence + /// Resolution for OCR image + /// The OCR-ed text of the bbox. + public static string GetSpanOcr(Page page, Rect bbox, int dpi = 300) + { + // Step 1: Make a high-resolution image of the bbox. + Pixmap pix = page.GetPixmap(dpi: dpi, clip: bbox); + byte[] ocrPdfBytes = pix.PdfOCR2Bytes(true); + + Document ocrPdf = new Document("pdf", ocrPdfBytes); + Page ocrPage = ocrPdf.LoadPage(0); + string text = ocrPage.GetText(); + text = text.Replace("\n", " ").Trim(); // Get rid of line breaks + + ocrPage.Dispose(); + ocrPdf.Close(); + pix.Dispose(); + + return text; + } + + /// + /// Repair text blocks with missing glyphs using OCR. + /// + /// TODO: Support non-linear block structure. + /// + public static List RepairBlocks(List inputBlocks, Page page, int dpi = 300) + { + List repairedBlocks = new List(); + + foreach (var block in inputBlocks) + { + if (block.Type != 0) // Accept non-text blocks as is + { + repairedBlocks.Add(block); + continue; + } + + if (block.Lines != null) + { + foreach (var line in block.Lines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + string spanText = ""; + if (span.Chars != null && span.Chars.Count > 0) + { + spanText = string.Join("", span.Chars.Select(c => c.C)); + } + else + { + spanText = span.Text ?? ""; + } + + if (!spanText.Contains(Utils.REPLACEMENT_CHARACTER)) + continue; + + int spanTextLen = spanText.Length; + string newText = GetSpanOcr(page, span.Bbox, dpi); + if (newText.Length > spanTextLen) + newText = newText.Substring(0, spanTextLen); + + if (span.Chars != null && span.Chars.Count > 0) + { + // Rebuild chars array + List newChars = new List(); + int minLen = Math.Min(newText.Length, span.Chars.Count); + for (int i = 0; i < minLen; i++) + { + Char oldChar = span.Chars[i]; + Char newChar = new Char + { + C = newText[i], + Origin = oldChar.Origin, + Bbox = oldChar.Bbox, + // Copy other properties as needed + }; + newChars.Add(newChar); + } + span.Chars = newChars; + } + else + { + span.Text = newText; + } + } + } + } + } + repairedBlocks.Add(block); + } + + return repairedBlocks; + } + + /// + /// Determine whether the page contains text worthwhile to OCR. + /// + /// MuPDF.NET Page object + /// DPI used for rasterization *if* we decide to OCR + /// Area to consider for text presence + /// + /// The full-page transformation matrix, the full-page pixmap and a + /// boolean indicating whether the page is photo-like (True) or + /// text-like (False). 
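+        /// Example (illustrative sketch; page is assumed to be a loaded MuPDF.NET Page):
+        ///     var (matrix, pix, photo) = CheckOcr.GetPageImage(page, dpi: 150);
+        ///     if (photo)
+        ///         Console.WriteLine("Photo-like page - OCR will be skipped.");
+        ///     else if (pix != null)
+        ///         Console.WriteLine($"Rasterized to {pix.W} x {pix.H} for OCR.");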
+ /// + public static (Matrix matrix, Pixmap pix, bool photo) GetPageImage( + Page page, + int dpi = 150, + Rect covered = null) + { + if (covered == null) + covered = page.Rect; + + IRect irect = new IRect((int)covered.X0, (int)covered.Y0, + (int)covered.X1, (int)covered.Y1); + + // Make a gray pixmap of the covered area + Rect clipRect = new Rect(covered); + Pixmap pixCovered = page.GetPixmap(colorSpace: "gray", clip: clipRect); + + // Convert to byte array for image quality analysis (convert to numpy array) + int width = pixCovered.W; + int height = pixCovered.H; + byte[] samples = pixCovered.SAMPLES; + + // Create 2D array for image quality analysis + byte[,] gray = new byte[height, width]; + int sampleIndex = 0; + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x++) + { + gray[y, x] = samples[sampleIndex++]; + } + } + + // Run photo checks + var scores = ImageQuality.AnalyzeImage(gray); + double score = scores.ContainsKey("score") ? scores["score"].value : 0; + + if (score >= 3) + { + pixCovered.Dispose(); + return (new Matrix(1, 0, 0, 1, 0, 0), null, true); // Identity matrix + } + else + { + Pixmap pix = page.GetPixmap(dpi: dpi); + IRect pixRect = new IRect(0, 0, pix.W, pix.H); + Matrix matrix = new Matrix( + page.Rect.Width / pix.W, + 0, + 0, + page.Rect.Height / pix.H, + 0, + 0 + ); + pixCovered.Dispose(); + return (matrix, pix, false); + } + } + + /// + /// Decide whether a MuPDF.NET page should be OCR'd. + /// + /// MuPDF.NET page object + /// DPI used for rasterization + /// Minimum number of vector paths to suggest glyph simulation + /// Fraction of page area covered by images to trigger OCR + /// Fraction of readable characters to skip OCR + /// Output of page.get_text("dict") if already available + /// Dictionary with decision and diagnostic flags + public static Dictionary ShouldOcrPage( + Page page, + int dpi = 150, + float vectorThresh = 0.9f, + float imageCoverageThresh = 0.9f, + float textReadabilityThresh = 0.9f, + List blocks = null) + { + var decision = new Dictionary + { + ["should_ocr"] = false, + ["has_ocr_text"] = false, + ["has_text"] = false, + ["readable_text"] = false, + ["image_covers_page"] = false, + ["has_vector_chars"] = false, + ["transform"] = new Matrix(1, 0, 0, 1, 0, 0), // Identity matrix + ["pixmap"] = null, + }; + + Rect pageRect = page.Rect; + float pageArea = Math.Abs(pageRect.Width * pageRect.Height); + + // Analyze the page + var analysis = Utils.AnalyzePage(page, blocks); + + // Return if page is completely blank + Rect covered = analysis["covered"] as Rect; + if (Utils.BboxIsEmpty(covered)) + { + decision["should_ocr"] = false; + return decision; + } + + // Return if page has been OCR'd already + int ocrSpans = (int)analysis["ocr_spans"]; + if (ocrSpans > 0) + { + decision["has_ocr_text"] = true; + decision["should_ocr"] = false; + return decision; + } + + float txtArea = (float)analysis["txt_area"]; + int charsTotal = (int)analysis["chars_total"]; + float txtJoins = (float)analysis["txt_joins"]; + float vecArea = (float)analysis["vec_area"]; + float imgArea = (float)analysis["img_area"]; + int charsBad = (int)analysis["chars_bad"]; + + // Preset OCR if very little text area exists + // Less than 5% text area in covered area + if (txtArea < 0.05f && charsTotal < 200 && txtJoins < 0.3f) + { + if (vecArea >= vectorThresh) + { + decision["should_ocr"] = true; + decision["has_vector_chars"] = true; + } + if (imgArea >= imageCoverageThresh) + { + decision["should_ocr"] = true; + decision["image_covers_page"] = true; + } + } + 
else if (charsTotal >= 200) + { + decision["has_text"] = true; + float readability = 1.0f - (float)charsBad / charsTotal; + if (readability >= textReadabilityThresh) + { + decision["readable_text"] = true; + decision["should_ocr"] = false; + } + else + { + decision["readable_text"] = false; + decision["should_ocr"] = true; + } + } + + if (!(bool)decision["should_ocr"]) + return decision; + + if (!(bool)decision["readable_text"] && (bool)decision["has_text"]) + return decision; + + // We need OCR and do a final check for potential text presence + if (!(bool)decision["has_text"]) + { + // Rasterize and check for photo versus text-heaviness + var (matrix, pix, photo) = GetPageImage(page, dpi, covered); + + if (photo) + { + // This seems to be a non-text picture page + decision["should_ocr"] = false; + decision["pixmap"] = null; + } + else + { + decision["should_ocr"] = true; + decision["transform"] = matrix; + decision["pixmap"] = pix; + } + } + + return decision; + } + } +} diff --git a/MuPDF.NET4LLM/helpers/DocumentLayout.cs b/MuPDF.NET4LLM/helpers/DocumentLayout.cs new file mode 100644 index 0000000..3391fa9 --- /dev/null +++ b/MuPDF.NET4LLM/helpers/DocumentLayout.cs @@ -0,0 +1,807 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using MuPDF.NET; +using Newtonsoft.Json; + +namespace MuPDF.NET4LLM.Helpers +{ + /// + /// Layout box representing a content region on a page + /// + public class LayoutBox + { + public float X0 { get; set; } + public float Y0 { get; set; } + public float X1 { get; set; } + public float Y1 { get; set; } + public string BoxClass { get; set; } // e.g. 'text', 'picture', 'table', etc. + // If boxclass == 'picture' or 'formula', store image bytes + public byte[] Image { get; set; } + // If boxclass == 'table' + public Dictionary Table { get; set; } + // Text line information for text-type boxclasses + public List TextLines { get; set; } + } + + /// + /// Text line information + /// + public class TextLineInfo + { + public Rect Bbox { get; set; } + public List Spans { get; set; } + } + + /// + /// Page layout information + /// + public class PageLayout + { + public int PageNumber { get; set; } + public float Width { get; set; } + public float Height { get; set; } + public List Boxes { get; set; } + public bool FullOcred { get; set; } // Whether the page is an OCR page + public bool TextOcred { get; set; } // Whether the page text only is OCR'd + public List FullText { get; set; } // Full page text in extractDICT format + public List Words { get; set; } // List of words with bbox (not yet activated) + public List Links { get; set; } + } + + /// + /// Parsed document structure and layout serialization helpers. + /// Ported and adapted from the Python module helpers/document_layout.py in pymupdf4llm. + /// + public class ParsedDocument + { + public string Filename { get; set; } // Source file name + public int PageCount { get; set; } + public List Toc { get; set; } // e.g. 
[{'title': 'Intro', 'page': 1}] + public List Pages { get; set; } + public Dictionary Metadata { get; set; } + public Dictionary> FormFields { get; set; } + public bool FromBytes { get; set; } // Whether loaded from bytes + public int ImageDpi { get; set; } = 150; // Image resolution + public string ImageFormat { get; set; } = "png"; // 'png' or 'jpg' + public string ImagePath { get; set; } = ""; // Path to save images + public bool UseOcr { get; set; } = true; // If beneficial invoke OCR + public bool ForceText { get; set; } + public bool EmbedImages { get; set; } + public bool WriteImages { get; set; } + + /// + /// Serialize the parsed document into Markdown text, closely following + /// ParsedDocument.to_markdown in the original Python implementation. + /// + public string ToMarkdown( + bool header = true, + bool footer = true, + bool writeImages = false, + bool embedImages = false, + bool ignoreCode = false, + bool showProgress = false, + bool pageSeparators = false, + bool pageChunks = false) + { + if (pageChunks) + { + throw new NotImplementedException("Page chunks mode not yet fully implemented"); + } + + var documentOutput = new StringBuilder(); + + foreach (var page in Pages) + { + var mdString = new StringBuilder(); + // Make a mapping: box number -> list item hierarchy level + var listItemLevels = CreateListItemLevels(page.Boxes); + + foreach (var (box, i) in page.Boxes.Select((b, idx) => (b, idx))) + { + var clip = new Rect(box.X0, box.Y0, box.X1, box.Y1); + string btype = box.BoxClass; + + if (btype == "page-header" && !header) + continue; + if (btype == "page-footer" && !footer) + continue; + + if (btype == "picture" || btype == "formula" || btype == "table-fallback") + { + if (box.Image != null) + { + if (embedImages) + { + // Make a base64 encoded string of the image + string base64 = Convert.ToBase64String(box.Image); + string data = $"data:image/{ImageFormat};base64,{base64}"; + mdString.Append($"\n![]({data})\n\n"); + } + else if (writeImages) + { + // Save image and reference it + mdString.Append($"\n![Image]({ImagePath})\n\n"); + } + } + else + { + mdString.Append($"**==> picture [{clip.Width} x {clip.Height}] intentionally omitted <==**\n\n"); + } + + // Output text in image if requested + if (box.TextLines != null && box.TextLines.Count > 0) + { + mdString.Append(TextToMd(box.TextLines, ignoreCode || page.FullOcred)); + } + } + else if (btype == "table" && box.Table != null) + { + if (box.Table.ContainsKey("markdown")) + { + string tableText = box.Table["markdown"].ToString(); + if (page.FullOcred) + // Remove code style if page was OCR'd + tableText = tableText.Replace("`", ""); + mdString.Append(tableText + "\n\n"); + } + } + else if (btype == "list-item") + { + int level = listItemLevels.ContainsKey(i) ? listItemLevels[i] : 1; + mdString.Append(ListItemToMd(box.TextLines, level)); + } + else if (btype == "footnote") + { + mdString.Append(FootnoteToMd(box.TextLines)); + } + else if (box.TextLines != null) + { + // Treat as normal MD text + mdString.Append(TextToMd(box.TextLines, ignoreCode || page.FullOcred)); + } + } + + if (pageSeparators) + { + mdString.Append($"--- end of page={page.PageNumber} ---\n\n"); + } + + documentOutput.Append(mdString.ToString()); + } + + return documentOutput.ToString(); + } + + /// + /// Serialize the parsed document into a JSON string, mirroring the behavior + /// of the Python ParsedDocument.to_json helper. 
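+        /// Example (illustrative sketch; parsed is assumed to be a ParsedDocument returned by
+        /// DocumentLayout.ParseDocument, and the output path is a placeholder):
+        ///     string json = parsed.ToJson();
+        ///     System.IO.File.WriteAllText("layout.json", json);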
+ /// + public string ToJson() + { + // Serialize to JSON + var settings = new JsonSerializerSettings + { + Formatting = Formatting.Indented, + NullValueHandling = NullValueHandling.Ignore, + Converters = new List + { + new LayoutJsonConverter() + } + }; + + return JsonConvert.SerializeObject(this, settings); + } + + /// + /// Serialize the parsed document to plain text. + /// This follows the logic of ParsedDocument.to_text in the Python code, + /// including optional suppression of headers / footers and simple table rendering. + /// + public string ToText( + bool header = true, + bool footer = true, + bool ignoreCode = false, + bool showProgress = false, + bool pageChunks = false, + string tableFormat = "grid") + { + if (pageChunks) + { + throw new NotImplementedException("Page chunks mode not yet fully implemented"); + } + + var documentOutput = new StringBuilder(); + + foreach (var page in Pages) + { + var textString = new StringBuilder(); + var listItemLevels = CreateListItemLevels(page.Boxes); + + foreach (var (box, i) in page.Boxes.Select((b, idx) => (b, idx))) + { + var clip = new Rect(box.X0, box.Y0, box.X1, box.Y1); + string btype = box.BoxClass; + + if (btype == "page-header" && !header) + continue; + if (btype == "page-footer" && !footer) + continue; + + if (btype == "picture" || btype == "formula" || btype == "table-fallback") + { + textString.Append($"==> picture [{clip.Width} x {clip.Height}] <==\n\n"); + if (box.TextLines != null && box.TextLines.Count > 0) + { + textString.Append(TextToText(box.TextLines, ignoreCode || page.FullOcred)); + } + } + else if (btype == "table" && box.Table != null) + { + // Note: Table formatting would need tabulate equivalent + textString.Append("[Table]\n\n"); + } + else if (btype == "list-item") + { + int level = listItemLevels.ContainsKey(i) ? listItemLevels[i] : 1; + textString.Append(ListItemToText(box.TextLines, level)); + } + else if (btype == "footnote") + { + textString.Append(FootnoteToText(box.TextLines)); + } + else if (box.TextLines != null) + { + // Handle other cases as normal text + textString.Append(TextToText(box.TextLines, ignoreCode || page.FullOcred)); + } + } + + documentOutput.Append(textString.ToString()); + } + + return documentOutput.ToString(); + } + + // Helper methods for text conversion + private static string TitleToMd(List textLines) + { + var sb = new StringBuilder(); + foreach (var line in textLines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + sb.Append(span.Text); + } + } + sb.Append("\n"); + } + return sb.ToString(); + } + + private static string SectionHdrToMd(List textLines) + { + var sb = new StringBuilder(); + foreach (var line in textLines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + sb.Append(span.Text); + } + } + sb.Append("\n"); + } + return sb.ToString(); + } + + /// + /// Convert list-item layout boxes to markdown. + /// The first line is prefixed with -. Subsequent lines are appended + /// without a line break if their rectangle does not start to the left of + /// the previous line; otherwise a new markdown list item is started. + /// 2 units of tolerance is used to avoid spurious line breaks. + /// + /// The text line information for the list item. + /// The hierarchy level (1 for top-level). 
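+        /// Example: a level-2 item whose spans read "Second point" is emitted as
+        ///     "  - Second point"
+        /// i.e. two leading spaces per level above 1, following the indentation rule described above.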
+ private static string ListItemToMd(List textLines, int level) + { + var sb = new StringBuilder(); + string indent = new string(' ', (level - 1) * 2); // Indentation based on level + sb.Append(indent + "- "); + foreach (var line in textLines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + sb.Append(span.Text); + } + } + } + sb.Append("\n"); + return sb.ToString(); + } + + /// + /// Convert footnote layout boxes to markdown. + /// The first line is prefixed with > ; subsequent lines start a + /// new blockquote when they begin with superscripted text. + /// We render footnotes as blockquotes. + /// + /// The text line information for the footnote. + private static string FootnoteToMd(List textLines) + { + var sb = new StringBuilder(); + // We render footnotes as blockquotes + sb.Append("[^"); + foreach (var line in textLines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + sb.Append(span.Text); + } + } + } + sb.Append("]\n"); + return sb.ToString(); + } + + /// + /// Convert generic text layout boxes to markdown, as well as box classes + /// not specifically handled elsewhere. Lines are concatenated without + /// line breaks; at the end, two newlines are used to separate from the + /// next block. Monospaced spans may be emitted as code when + /// is false. + /// + /// The text line information to convert. + /// If true, do not emit code-style formatting. + private static string TextToMd(List textLines, bool ignoreCode) + { + // Handle completely monospaced textlines as code block + // Check for superscript - handle mis-classified text boundary box + if (textLines == null || textLines.Count == 0) + return ""; + + var sb = new StringBuilder(); + foreach (var line in textLines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + string text = span.Text ?? ""; + if (!ignoreCode && span.Font != null && span.Font.Contains("Mono")) + { + text = "`" + text + "`"; + } + sb.Append(text); + } + } + sb.Append("\n"); + } + return sb.ToString(); + } + + /// + /// Convert list-item layout boxes to plain text. + /// The first line is prefixed with a dash and indentation according to + /// the hierarchy level; subsequent lines are concatenated. + /// + /// The text line information for the list item. + /// The hierarchy level (1 for top-level). + private static string ListItemToText(List textLines, int level) + { + if (textLines == null || textLines.Count == 0) + return ""; + + var sb = new StringBuilder(); + string indent = new string(' ', (level - 1) * 2); // Indentation based on level + sb.Append(indent + "- "); + foreach (var line in textLines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + sb.Append(span.Text); + } + } + } + sb.Append("\n"); + return sb.ToString(); + } + + /// + /// Convert footnote layout boxes to plain text, concatenating + /// all spans into a single textual representation. + /// We render footnotes as blockquotes. + /// + /// The text line information for the footnote. + private static string FootnoteToText(List textLines) + { + if (textLines == null || textLines.Count == 0) + return ""; + + var sb = new StringBuilder(); + // We render footnotes as blockquotes + foreach (var line in textLines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + sb.Append(span.Text); + } + } + } + sb.Append("\n"); + return sb.ToString(); + } + + /// + /// Convert generic text layout boxes to plain text. 
The text of all + /// spans of all lines is written without line breaks. + /// At the end, two newlines are added to separate from the next block. + /// + /// The text line information to convert. + /// Currently unused; included for parity with markdown conversion. + private static string TextToText(List textLines, bool ignoreCode) + { + if (textLines == null || textLines.Count == 0) + return ""; + + var sb = new StringBuilder(); + foreach (var line in textLines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + sb.Append(span.Text); + } + } + sb.Append("\n"); + } + return sb.ToString(); + } + + /// + /// Map the layout box index of each list item to its hierarchy level. + /// + /// This post-layout heuristic walks contiguous segments of list-item + /// boxes and assigns increasing levels when the left coordinate moves + /// sufficiently to the right, mirroring + /// create_list_item_levels in the Python implementation. + /// + /// The list of layout boxes for the page. + /// + /// A dictionary mapping box index to level, where level is 1 for + /// top-level items. + /// + private static Dictionary CreateListItemLevels(List boxes) + { + var itemDict = new Dictionary(); // Dictionary of item index -> level + var segments = new List>(); // List of item segments + var currentSegment = new List<(int idx, LayoutBox box)>(); // Current segment + + // Create segments of contiguous list items. Each non-list-item finishes + // the current segment. Also, two list-items in a row belonging to different + // page text columns end the segment after the first item. + for (int i = 0; i < boxes.Count; i++) + { + var box = boxes[i]; + if (box.BoxClass != "list-item") // Bbox class is no list-item + { + if (currentSegment.Count > 0) // End and save the current segment + { + segments.Add(currentSegment); + currentSegment = new List<(int idx, LayoutBox box)>(); + } + continue; + } + + if (currentSegment.Count > 0) // Check if we need to end the current segment + { + var (prevIdx, prevBox) = currentSegment[currentSegment.Count - 1]; + if (box.X0 > prevBox.X1 || box.Y1 < prevBox.Y0) + { + // End and save the current segment + segments.Add(currentSegment); + currentSegment = new List<(int idx, LayoutBox box)>(); + } + } + currentSegment.Add((i, box)); // Append item to segment + } + if (currentSegment.Count > 0) + segments.Add(currentSegment); // Append last segment + + // Walk through segments and assign levels + foreach (var segment in segments) + { + if (segment.Count == 0) continue; // Skip empty segments + var sorted = segment.OrderBy(x => x.box.X0).ToList(); // Sort by x0 coordinate of the bbox + + // List of leveled items in the segment: (idx, bbox, level) + // First item has level 1 + var leveled = new List<(int idx, LayoutBox box, int level)> + { + (sorted[0].idx, sorted[0].box, 1) + }; + + for (int i = 1; i < sorted.Count; i++) + { + var (prevIdx, prevBox, prevLvl) = leveled[leveled.Count - 1]; + var (currIdx, currBox) = sorted[i]; + // X0 coordinate increased by more than 10 points: increase level + int currLvl = currBox.X0 > prevBox.X0 + 10 ? prevLvl + 1 : prevLvl; + leveled.Add((currIdx, currBox, currLvl)); + } + + foreach (var (idx, box, lvl) in leveled) + { + itemDict[idx] = lvl; + } + } + + return itemDict; + } + } + + /// + /// Document layout parsing utilities. + /// Provides a C# equivalent of pymupdf4llm.helpers.document_layout.parse_document. 
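+    /// Example (illustrative sketch; "report.pdf" is a placeholder path):
+    ///     Document doc = new Document("report.pdf");
+    ///     ParsedDocument parsed = DocumentLayout.ParseDocument(doc);
+    ///     Console.WriteLine(parsed.ToMarkdown(pageSeparators: true));
+    ///     doc.Close();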
+ /// + public static class DocumentLayout + { + /// + /// Parse document structure + /// + public static ParsedDocument ParseDocument( + Document doc, + string filename = "", + int imageDpi = 150, + string imageFormat = "png", + string imagePath = "", + int ocrDpi = 400, + List pages = null, + bool writeImages = false, + bool embedImages = false, + bool showProgress = false, + bool forceText = true, + bool useOcr = true, + string ocrLanguage = "eng") + { + // Note: Remove StructTreeRoot to avoid possible performance degradation. + // We will not use the structure tree anyway. + if (embedImages && writeImages) + throw new ArgumentException("Cannot both embed and write images."); + + var document = new ParsedDocument + { + Filename = !string.IsNullOrEmpty(filename) ? filename : doc.Name, + PageCount = doc.PageCount, + Toc = doc.GetToc().Cast().ToList(), + Metadata = doc.MetaData, + FormFields = Utils.ExtractFormFieldsWithPages(doc), + ImageDpi = imageDpi, + ImageFormat = imageFormat, + ImagePath = imagePath, + UseOcr = useOcr, + ForceText = forceText, + EmbedImages = embedImages, + WriteImages = writeImages, + Pages = new List() + }; + + if (pages == null) + pages = Enumerable.Range(0, doc.PageCount).ToList(); + + var progressBar = showProgress && pages.Count > 5 + ? ProgressBar.Create(pages.Cast().ToList()) + : null; + + try + { + foreach (int pno in pages) + { + if (progressBar != null && !progressBar.MoveNext()) + break; + + Page page = doc.LoadPage(pno); + try + { + TextPage textPage = page.GetTextPage( + clip: new Rect(float.NegativeInfinity, float.NegativeInfinity, + float.PositiveInfinity, float.PositiveInfinity), + flags: Utils.FLAGS); + PageInfo pageInfo = textPage.ExtractDict(null, false); + List blocks = pageInfo.Blocks; + + bool pageFullOcred = false; + bool pageTextOcred = false; + + // Check if this page should be OCR'd + if (useOcr) + { + var decision = CheckOcr.ShouldOcrPage(page, dpi: ocrDpi, blocks: blocks); + // Prevent MD styling if already OCR'd + pageFullOcred = decision.TryGetValue("has_ocr_text", out var hasOcrText) ? (bool)hasOcrText : false; + + if (decision.TryGetValue("should_ocr", out var shouldOcr) && (bool)shouldOcr) + { + // We should be OCR: check full-page vs. 
text-only + if (decision.ContainsKey("pixmap") && decision["pixmap"] != null) + { + // Full-page OCR would be implemented here + // Retrieve the Pixmap, OCR it, get the OCR'd PDF, copy text over to original page + pageFullOcred = true; + } + else + { + blocks = CheckOcr.RepairBlocks(blocks, page); + pageTextOcred = true; + } + } + } + + var pageLayout = new PageLayout + { + PageNumber = pno, + Width = page.Rect.Width, + Height = page.Rect.Height, + Boxes = new List(), + FullOcred = pageFullOcred, + TextOcred = pageTextOcred, + FullText = blocks, + Words = new List(), + Links = page.GetLinks() + }; + + // Extract text lines for each block + // Each line is represented as its bbox and a list of spans + var lines = GetTextLines.GetRawLines(textPage, blocks, page.Rect); + + foreach (var line in lines) + { + var layoutBox = new LayoutBox + { + X0 = line.Rect.X0, + Y0 = line.Rect.Y0, + X1 = line.Rect.X1, + Y1 = line.Rect.Y1, + BoxClass = "text", + TextLines = new List + { + new TextLineInfo + { + Bbox = line.Rect, + Spans = line.Spans + } + } + }; + + pageLayout.Boxes.Add(layoutBox); + } + + document.Pages.Add(pageLayout); + textPage.Dispose(); + } + finally + { + page.Dispose(); + } + } + } + finally + { + progressBar?.Dispose(); + } + + return document; + } + } + + /// + /// Custom JSON converter for Layout objects + /// + public class LayoutJsonConverter : JsonConverter + { + public override bool CanConvert(Type objectType) + { + return objectType == typeof(byte[]) || + objectType == typeof(Rect) || + objectType == typeof(Point) || + objectType == typeof(Matrix) || + objectType == typeof(IRect) || + objectType == typeof(Quad); + } + + public override object ReadJson(JsonReader reader, Type objectType, object existingValue, JsonSerializer serializer) + { + throw new NotImplementedException("Deserialization not implemented"); + } + + public override void WriteJson(JsonWriter writer, object value, JsonSerializer serializer) + { + if (value == null) + { + writer.WriteNull(); + return; + } + + if (value is byte[] bytes) + { + string base64 = Convert.ToBase64String(bytes); + writer.WriteValue(base64); + } + else if (value is Rect rect) + { + writer.WriteStartArray(); + writer.WriteValue(rect.X0); + writer.WriteValue(rect.Y0); + writer.WriteValue(rect.X1); + writer.WriteValue(rect.Y1); + writer.WriteEndArray(); + } + else if (value is Point point) + { + writer.WriteStartArray(); + writer.WriteValue(point.X); + writer.WriteValue(point.Y); + writer.WriteEndArray(); + } + else if (value is Matrix matrix) + { + writer.WriteStartArray(); + writer.WriteValue(matrix.A); + writer.WriteValue(matrix.B); + writer.WriteValue(matrix.C); + writer.WriteValue(matrix.D); + writer.WriteValue(matrix.E); + writer.WriteValue(matrix.F); + writer.WriteEndArray(); + } + else if (value is IRect irect) + { + writer.WriteStartArray(); + writer.WriteValue(irect.X0); + writer.WriteValue(irect.Y0); + writer.WriteValue(irect.X1); + writer.WriteValue(irect.Y1); + writer.WriteEndArray(); + } + else if (value is Quad quad) + { + writer.WriteStartArray(); + writer.WriteStartArray(); + writer.WriteValue(quad.UpperLeft.X); + writer.WriteValue(quad.UpperLeft.Y); + writer.WriteEndArray(); + writer.WriteStartArray(); + writer.WriteValue(quad.UpperRight.X); + writer.WriteValue(quad.UpperRight.Y); + writer.WriteEndArray(); + writer.WriteStartArray(); + writer.WriteValue(quad.LowerLeft.X); + writer.WriteValue(quad.LowerLeft.Y); + writer.WriteEndArray(); + writer.WriteStartArray(); + writer.WriteValue(quad.LowerRight.X); + 
writer.WriteValue(quad.LowerRight.Y); + writer.WriteEndArray(); + writer.WriteEndArray(); + } + else + { + writer.WriteNull(); + } + } + } +} diff --git a/MuPDF.NET4LLM/helpers/ExtendedSpan.cs b/MuPDF.NET4LLM/helpers/ExtendedSpan.cs new file mode 100644 index 0000000..5c9e4c8 --- /dev/null +++ b/MuPDF.NET4LLM/helpers/ExtendedSpan.cs @@ -0,0 +1,24 @@ +using System.Collections.Generic; +using MuPDF.NET; + +namespace MuPDF.NET4LLM.Helpers +{ + /// + /// Extended span information for text line extraction. + /// Mirrors the span dictionaries produced by pymupdf4llm in the Python helpers. + /// + public class ExtendedSpan + { + public string Text { get; set; } + public Rect Bbox { get; set; } + public float Size { get; set; } + public string Font { get; set; } + public int Flags { get; set; } + public int CharFlags { get; set; } + public float Alpha { get; set; } + public int Line { get; set; } + public int Block { get; set; } + public Point Dir { get; set; } + public List Chars { get; set; } + } +} diff --git a/MuPDF.NET4LLM/helpers/GetTextLines.cs b/MuPDF.NET4LLM/helpers/GetTextLines.cs new file mode 100644 index 0000000..58e23c6 --- /dev/null +++ b/MuPDF.NET4LLM/helpers/GetTextLines.cs @@ -0,0 +1,442 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using MuPDF.NET; + +namespace MuPDF.NET4LLM.Helpers +{ + /// + /// Represents a line with its rectangle and spans + /// + public class TextLine + { + public Rect Rect { get; set; } + public List Spans { get; set; } + } + + /// + /// Text line extraction utilities. + /// Ported and adapted from the Python module helpers/get_text_lines.py in pymupdf4llm. + /// + public static class GetTextLines + { + /// + /// Extract the text spans from a in natural reading order. + /// All spans whose vertical positions are within of + /// each other are merged into a single logical line, mirroring the behavior of + /// get_raw_lines() in helpers/get_text_lines.py. + /// + /// + /// This is a C# port of pymupdf4llm.helpers.get_text_lines.get_raw_lines. + /// It compensates for MuPDF’s tendency to create multiple short lines when spans + /// are separated by small gaps, by joining adjacent spans into longer lines. + /// + /// The result is a list of objects, each containing a + /// joined line rectangle and a left‑to‑right sorted list of + /// items. Each span is annotated with its original block / line index so that + /// callers can still detect original MuPDF line breaks if needed. + /// + /// + /// Source . May be null if + /// are provided directly. + /// + /// + /// Optional list of objects to reuse an existing + /// ExtractDict result instead of re‑extracting from . + /// Only text blocks (Type == 0) with non‑empty bounding boxes are considered. + /// + /// +/// Optional clipping rectangle. Only spans whose bounding boxes overlap this +/// area (within ) are taken into account. + /// + /// + /// Maximum vertical distance (in points) between span baselines or tops for + /// them to be considered part of the same logical line (default: 3). + /// + /// + /// When true, spans with zero alpha (invisible text) are skipped, except + /// for Type 3 fonts (which are always kept), matching the Python logic. + /// + /// + /// When true, only spans with approximately horizontal direction + /// vectors are included (i.e. abs(1 - dir.x) <= 1e‑3), ignoring + /// vertical or rotated text. + /// + /// + /// A list of objects. If no spans are found, an + /// empty list is returned. 
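+        /// Example (illustrative sketch; page is assumed to be a loaded Page, and System.Linq is in scope):
+        ///     TextPage tp = page.GetTextPage(clip: page.Rect, flags: (int)TextFlags.TEXT_MEDIABOX_CLIP);
+        ///     foreach (TextLine line in GetTextLines.GetRawLines(tp, clip: page.Rect))
+        ///     {
+        ///         string text = string.Join("", line.Spans.Select(s => s.Text));
+        ///         Console.WriteLine($"{line.Rect}: {text}");
+        ///     }
+        ///     tp.Dispose();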
+ /// + public static List GetRawLines( + TextPage textpage = null, + List blocks = null, + Rect clip = null, + float tolerance = 3.0f, + bool ignoreInvisible = true, + bool onlyHorizontal = true) + { + float yDelta = tolerance; // Allowable vertical coordinate deviation + + if (textpage == null && blocks == null) + throw new ArgumentException("Either textpage or blocks must be provided."); + + if (clip == null && textpage != null) + { + // Use TextPage rect if not provided + clip = new Rect(float.NegativeInfinity, float.NegativeInfinity, + float.PositiveInfinity, float.PositiveInfinity); + } + + // Extract text blocks - if bbox is not empty + if (blocks == null && textpage != null) + { + PageInfo pageInfo = textpage.ExtractDict(null, false); + blocks = pageInfo.Blocks?.Where(b => b.Type == 0 && !Utils.BboxIsEmpty(b.Bbox)).ToList(); + } + + if (blocks == null) + blocks = new List(); + + List spans = new List(); // All spans in TextPage here + + for (int bno = 0; bno < blocks.Count; bno++) // The numbered blocks + { + Block b = blocks[bno]; + if (Utils.OutsideBbox(b.Bbox, clip)) + continue; + + if (b.Lines == null) + continue; + + for (int lno = 0; lno < b.Lines.Count; lno++) // The numbered lines + { + Line line = b.Lines[lno]; + if (Utils.OutsideBbox(line.Bbox, clip)) + continue; + + Point lineDir = line.Dir; + if (onlyHorizontal && Math.Abs(1 - lineDir.X) > 1e-3) // Only accept horizontal text + continue; + + if (line.Spans == null) + continue; + + for (int sno = 0; sno < line.Spans.Count; sno++) // The numbered spans + { + Span s = line.Spans[sno]; + string text = s.Text ?? ""; + + if (Utils.IsWhite(text)) + // Ignore white text if not a Type3 font + continue; + + // Ignore invisible text. Type 3 font text is never invisible. + // Note: Alpha and CharFlags may need different access in MuPDF.NET + if (s.Font != Utils.TYPE3_FONT_NAME && ignoreInvisible) + { + // Skip invisible text if needed - would need Alpha property + // For now, continue + } + + if (!Utils.AlmostInBbox(s.Bbox, clip)) // If not in clip + continue; + + Rect sbbox = new Rect(s.Bbox); // Span bbox as a Rect + if (((int)s.Flags & 1) != 0) // If a superscript, modify bbox + { + // With that of the preceding or following span + int i = sno == 0 ? 1 : sno - 1; + if (line.Spans.Count > i) + { + Span neighbor = line.Spans[i]; + sbbox.Y1 = neighbor.Bbox.Y1; + } + text = $"[{text}]"; + } + + sbbox = sbbox; // Update with the Rect version + // Include line/block numbers to facilitate separator insertion + ExtendedSpan extSpan = new ExtendedSpan + { + Text = text, + Bbox = sbbox, + Size = s.Size, + Font = s.Font, + Flags = (int)s.Flags, + CharFlags = 0, // Would need to extract from Span if available + Alpha = 1.0f, // Would need to extract from Span if available + Line = lno, + Block = bno, + Dir = lineDir, + Chars = s.Chars + }; + + spans.Add(extSpan); + } + } + } + + if (spans.Count == 0) // No text at all + return new List(); + + // Sort spans by bottom coord + spans = spans.OrderBy(s => -s.Dir.X).ThenBy(s => s.Bbox.Y1).ToList(); + + List nlines = new List(); // Final result + List currentLine = new List { spans[0] }; // Collects spans with fitting vertical coordinates + Rect lrect = new Rect(spans[0].Bbox); // Rectangle joined from span rectangles + + for (int i = 1; i < spans.Count; i++) // Walk through the spans + { + ExtendedSpan s = spans[i]; + Rect sbbox = s.Bbox; // This bbox + Rect sbbox0 = currentLine[currentLine.Count - 1].Bbox; // Previous bbox + // If any of top or bottom coordinates are close enough, join... 
+ if (Math.Abs(sbbox.Y1 - sbbox0.Y1) <= yDelta || + Math.Abs(sbbox.Y0 - sbbox0.Y0) <= yDelta) + { + currentLine.Add(s); // Append to this line + lrect = Utils.JoinRects(new List { lrect, sbbox }); // Extend line rectangle + continue; + } + + // End of current line, sort its spans from left to right + currentLine = SanitizeSpans(currentLine); + + // Append line rect and its spans to final output + nlines.Add(new TextLine { Rect = lrect, Spans = currentLine }); + + currentLine = new List { s }; // Start next line + lrect = new Rect(sbbox); // Initialize its rectangle + } + + // Need to append last line in the same way + currentLine = SanitizeSpans(currentLine); + nlines.Add(new TextLine { Rect = lrect, Spans = currentLine }); + + return nlines; + } + + /// + /// Sort and join spans within a single logical line. + /// + /// + /// This corresponds to the inner sanitize_spans() helper in + /// get_text_lines.py. Spans are first sorted left‑to‑right and then + /// adjacent spans with nearly touching x‑coordinates and identical style + /// (font flags and character flags, except superscript) are merged into a + /// single by concatenating their text and + /// joining their bounding boxes. + /// + private static List SanitizeSpans(List line) + { + if (line.Count == 0) + return line; + + // Sort ascending horizontally + line = line.OrderBy(s => s.Bbox.X0).ToList(); + // Join spans, delete duplicates + // Underline differences are being ignored + for (int i = line.Count - 1; i > 0; i--) // Iterate back to front + { + ExtendedSpan s0 = line[i - 1]; // Preceding span + ExtendedSpan s1 = line[i]; // This span + // "Delta" depends on the font size. Spans will be joined if + // no more than 10% of the font size separates them and important + // attributes are the same. + float delta = s1.Size * 0.1f; + if (s0.Bbox.X1 + delta < s1.Bbox.X0 || + s0.Flags != s1.Flags || + (s0.CharFlags & ~2) != (s1.CharFlags & ~2)) + { + continue; // No joining + } + // We need to join bbox and text of two consecutive spans + // Sometimes, spans may also be duplicated. + if (s0.Text != s1.Text || !s0.Bbox.EqualTo(s1.Bbox)) + { + s0.Text += s1.Text; + } + s0.Bbox = Utils.JoinRects(new List { s0.Bbox, s1.Bbox }); // Join boundary boxes + line.RemoveAt(i); // Delete the joined-in span + line[i - 1] = s0; // Update the span + } + + return line; + } + + /// + /// Extract plain text line‑by‑line in natural reading order. + /// + /// + /// This is the C# equivalent of get_text_lines() in + /// helpers/get_text_lines.py. It first obtains logical lines via + /// , + /// then concatenates spans on the same original MuPDF line, inserting the + /// separator when a new original line continues the + /// same logical line. + /// + /// For non‑OCR text ( = false), this produces + /// continuous text suitable for indexing while preserving a reasonable + /// reading order, including extra blank lines between text blocks. + /// + /// When is true, a simplified table recognition + /// is applied to the OCR output: lines are grouped into columns based on + /// x‑coordinates and emitted as a Markdown table, analogous to the Python + /// implementation. + /// + /// The source to extract text from. + /// + /// Optional pre‑created . When null, this method +/// will create a temporary text page (or OCR text page if +/// is true) and dispose it afterwards. + /// + /// + /// Optional clipping rectangle restricting the area from which lines are read. 
+ /// + /// + /// Separator string used when joining multiple MuPDF lines that are merged + /// into a single logical line (default: tab, matching the Python version). + /// + /// + /// Vertical tolerance passed through to . + /// + /// + /// When true, uses OCR text extraction and applies rudimentary + /// table reconstruction, returning a Markdown‑style table for tabular OCR output. + /// + /// + /// A string containing the page text in reading order. For non‑OCR mode, + /// this is plain text with line breaks and block separators. For OCR mode, + /// it may contain Markdown‑style tables. + /// + public static string GetTextLinesFormatted( + Page page, + TextPage textpage = null, + Rect clip = null, + string sep = "\t", + float tolerance = 3.0f, + bool ocr = false) + { + int textFlags = (int)TextFlags.TEXT_MEDIABOX_CLIP; + page.SetRotation(0); + Rect prect = clip ?? page.Rect; // Area to consider + + string xsep = sep == "|" ? "" : sep; + + // Make a TextPage if required + TextPage tp = textpage; + bool disposeTp = false; + + if (tp == null) + { + if (!ocr) + { + tp = page.GetTextPage(clip: prect, flags: textFlags); + } + else + { + tp = page.GetTextPageOcr(dpi: 300, full: true); + } + disposeTp = true; + } + + List lines = GetRawLines(tp, null, prect, tolerance); + + if (disposeTp) // Delete temp TextPage + { + tp?.Dispose(); + } + + if (lines == null || lines.Count == 0) + return ""; + + string alltext = ""; + + // Compose final text + if (!ocr) + { + int prevBno = -1; // Number of previous text block + foreach (var (lrect, line) in lines.Select(l => (l.Rect, l.Spans))) // Iterate through lines + { + // Insert extra line break if a different block + int bno = line[0].Block; // Block number of this line + if (bno != prevBno) + { + alltext += "\n"; + } + prevBno = bno; + + int lineNo = line[0].Line; // Store the line number of previous span + foreach (var s in line) // Walk over the spans in the line + { + int lno = s.Line; + string stext = s.Text; + if (lineNo == lno) + { + alltext += stext; + } + else + { + alltext += sep + stext; + } + lineNo = lno; + } + alltext += "\n"; // Append line break after a line + } + alltext += "\n"; // Append line break at end of block + return alltext; + } + + // For OCR output, we try a rudimentary table recognition. + List> rows = new List>(); + List xvalues = new List(); + int colCount = 0; + + foreach (var (lrect, line) in lines.Select(l => (l.Rect, l.Spans))) + { + // If only 1 span in line and no columns identified yet... 
+ if (line.Count == 1 && xvalues.Count == 0) + { + alltext += line[0].Text + "\n\n\n"; + continue; + } + // Multiple spans in line and no columns identified yet + else if (xvalues.Count == 0) // Define column borders + { + xvalues = line.Select(s => s.Bbox.X0).ToList(); + xvalues.Add(line[line.Count - 1].Bbox.X1); + colCount = line.Count; // Number of columns + } + + List row = new List(new string[colCount]); + foreach (var s in line) + { + for (int i = 0; i < xvalues.Count - 1; i++) + { + float x0 = xvalues[i]; + float x1 = xvalues[i + 1]; + if (Math.Abs(s.Bbox.X0 - x0) <= 3 || Math.Abs(s.Bbox.X1 - x1) <= 3) + { + row[i] = s.Text; + } + } + } + rows.Add(row); + } + + if (rows.Count > 0 && rows[0].Count > 0) + { + string header = "|" + string.Join("|", rows[0]) + "|\n"; + alltext += header; + alltext += "|" + string.Join("|", Enumerable.Range(0, rows[0].Count).Select(_ => "---")) + "|\n"; + for (int i = 1; i < rows.Count; i++) + { + alltext += "|" + string.Join("|", rows[i]) + "|\n"; + } + alltext += "\n"; + } + + return alltext; + } + } +} diff --git a/MuPDF.NET4LLM/helpers/ImageQuality.cs b/MuPDF.NET4LLM/helpers/ImageQuality.cs new file mode 100644 index 0000000..af6d64d --- /dev/null +++ b/MuPDF.NET4LLM/helpers/ImageQuality.cs @@ -0,0 +1,694 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Numerics; + +namespace MuPDF.NET4LLM.Helpers +{ + /// + /// Image quality analysis utilities. + /// Ported and adapted from the Python module helpers/image_quality.py in pymupdf4llm. + /// + public static class ImageQuality + { + /// + /// Bilinear resize (similar to OpenCV INTER_LINEAR), vectorized implementation in Python. + /// + /// Input image (2D byte array). + /// New height. + /// New width. + /// Resized image. + public static byte[,] ResizeBilinear(byte[,] img, int newH, int newW) + { + int h = img.GetLength(0); + int w = img.GetLength(1); + float[,] imgFloat = new float[h, w]; + for (int y = 0; y < h; y++) + for (int x = 0; x < w; x++) + imgFloat[y, x] = img[y, x]; + + // Target coordinates + float[] ys = new float[newH]; + float[] xs = new float[newW]; + for (int i = 0; i < newH; i++) + ys[i] = (i + 0.5f) * (h / (float)newH) - 0.5f; + for (int i = 0; i < newW; i++) + xs[i] = (i + 0.5f) * (w / (float)newW) - 0.5f; + + for (int i = 0; i < newH; i++) + ys[i] = Math.Max(0, Math.Min(h - 1, ys[i])); + for (int i = 0; i < newW; i++) + xs[i] = Math.Max(0, Math.Min(w - 1, xs[i])); + + int[] y0 = new int[newH]; + int[] x0 = new int[newW]; + for (int i = 0; i < newH; i++) + y0[i] = (int)Math.Floor(ys[i]); + for (int i = 0; i < newW; i++) + x0[i] = (int)Math.Floor(xs[i]); + + int[] y1 = new int[newH]; + int[] x1 = new int[newW]; + for (int i = 0; i < newH; i++) + y1[i] = Math.Min(h - 1, y0[i] + 1); + for (int i = 0; i < newW; i++) + x1[i] = Math.Min(w - 1, x0[i] + 1); + + byte[,] outImg = new byte[newH, newW]; + for (int y = 0; y < newH; y++) + { + float wy = ys[y] - y0[y]; + for (int x = 0; x < newW; x++) + { + float wx = xs[x] - x0[x]; + // Four corner values via fancy indexing + float Ia = imgFloat[y0[y], x0[x]]; // Top-left + float Ib = imgFloat[y0[y], x1[x]]; // Top-right + float Ic = imgFloat[y1[y], x0[x]]; // Bottom-left + float Id = imgFloat[y1[y], x1[x]]; // Bottom-right + + float top = Ia * (1 - wx) + Ib * wx; + float bottom = Ic * (1 - wx) + Id * wx; + float val = top * (1 - wy) + bottom * wy; + outImg[y, x] = (byte)Math.Max(0, Math.Min(255, val)); + } + } + return outImg; + } + + /// + /// 2D convolution (Cross-Correlation) with reflect padding. 
+ /// Vectorized over kernel in Python. + /// + /// Input image. + /// Convolution kernel. + /// Convolved image. + public static float[,] Convolve2D(float[,] img, float[,] kernel) + { + int kh = kernel.GetLength(0); + int kw = kernel.GetLength(1); + int padH = kh / 2; + int padW = kw / 2; + + int H = img.GetLength(0); + int W = img.GetLength(1); + float[,] padded = new float[H + 2 * padH, W + 2 * padW]; + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + padded[y + padH, x + padW] = img[y, x]; + + // Reflect padding + for (int y = 0; y < padH; y++) + for (int x = 0; x < W; x++) + padded[y, x + padW] = img[padH - y, x]; + for (int y = H + padH; y < H + 2 * padH; y++) + for (int x = 0; x < W; x++) + padded[y, x + padW] = img[2 * H + padH - y - 1, x]; + for (int y = 0; y < H + 2 * padH; y++) + for (int x = 0; x < padW; x++) + padded[y, x] = padded[y, 2 * padW - x]; + for (int y = 0; y < H + 2 * padH; y++) + for (int x = W + padW; x < W + 2 * padW; x++) + padded[y, x] = padded[y, 2 * (W + padW) - x - 2]; + + float[,] output = new float[H, W]; + // Loop only over kernel offsets, not over pixels + for (int i = 0; i < kh; i++) + { + for (int j = 0; j < kw; j++) + { + for (int y = 0; y < H; y++) + { + for (int x = 0; x < W; x++) + { + output[y, x] += kernel[i, j] * padded[y + i, x + j]; + } + } + } + } + return output; + } + + /// + /// 1D Gaussian kernel + /// + public static float[] GaussianKernel1D(int size = 5, float sigma = 1.0f) + { + float[] kernel = new float[size]; + int center = size / 2; + float sum = 0; + for (int i = 0; i < size; i++) + { + float x = i - center; + kernel[i] = (float)Math.Exp(-0.5 * (x / sigma) * (x / sigma)); + sum += kernel[i]; + } + for (int i = 0; i < size; i++) + kernel[i] /= sum; + return kernel; + } + + /// + /// Separable Gaussian Blur: first horizontal, then vertical. + /// + /// Input image. + /// Kernel size. + /// Sigma value. + /// Blurred image. + public static float[,] GaussianBlur(float[,] img, int ksize = 5, float sigma = 1.0f) + { + float[] kernel = GaussianKernel1D(ksize, sigma); + int H = img.GetLength(0); + int W = img.GetLength(1); + int pad = ksize / 2; + + // Horizontal + float[,] padded = new float[H, W + 2 * pad]; + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + padded[y, x + pad] = img[y, x]; + // Reflect padding + for (int y = 0; y < H; y++) + for (int x = 0; x < pad; x++) + padded[y, x] = padded[y, 2 * pad - x]; + for (int y = 0; y < H; y++) + for (int x = W + pad; x < W + 2 * pad; x++) + padded[y, x] = padded[y, 2 * (W + pad) - x - 2]; + + float[,] tmp = new float[H, W]; + for (int y = 0; y < H; y++) + { + for (int x = 0; x < W; x++) + { + float sum = 0; + for (int j = 0; j < ksize; j++) + sum += kernel[j] * padded[y, x + j]; + tmp[y, x] = sum; + } + } + + // Vertical + padded = new float[H + 2 * pad, W]; + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + padded[y + pad, x] = tmp[y, x]; + // Reflect padding + for (int y = 0; y < pad; y++) + for (int x = 0; x < W; x++) + padded[y, x] = padded[2 * pad - y, x]; + for (int y = H + pad; y < H + 2 * pad; y++) + for (int x = 0; x < W; x++) + padded[y, x] = padded[2 * (H + pad) - y - 2, x]; + + float[,] output = new float[H, W]; + for (int y = 0; y < H; y++) + { + for (int x = 0; x < W; x++) + { + float sum = 0; + for (int i = 0; i < ksize; i++) + sum += kernel[i] * padded[y + i, x]; + output[y, x] = sum; + } + } + return output; + } + + /// + /// Sobel gradients in x/y, Magnitude and Angle. + /// + /// Input image. + /// Magnitude and Angle matrices. 
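+        /// Example (illustrative sketch; grayTile is assumed to be a byte[,] grayscale image):
+        ///     var (mag, ang) = ImageQuality.SobelGradients(grayTile);
+        ///     double sum = 0;
+        ///     foreach (float m in mag) sum += m;
+        ///     double meanMagnitude = sum / mag.Length;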
+ public static (float[,] mag, float[,] ang) SobelGradients(byte[,] img) + { + float[,] imgFloat = new float[img.GetLength(0), img.GetLength(1)]; + for (int y = 0; y < img.GetLength(0); y++) + for (int x = 0; x < img.GetLength(1); x++) + imgFloat[y, x] = img[y, x]; + + float[,] Kx = new float[,] { { -1, 0, 1 }, { -2, 0, 2 }, { -1, 0, 1 } }; + float[,] Ky = new float[,] { { -1, -2, -1 }, { 0, 0, 0 }, { 1, 2, 1 } }; + + float[,] gx = Convolve2D(imgFloat, Kx); + float[,] gy = Convolve2D(imgFloat, Ky); + + int H = img.GetLength(0); + int W = img.GetLength(1); + float[,] mag = new float[H, W]; + float[,] ang = new float[H, W]; + for (int y = 0; y < H; y++) + { + for (int x = 0; x < W; x++) + { + mag[y, x] = (float)Math.Sqrt(gx[y, x] * gx[y, x] + gy[y, x] * gy[y, x]); + ang[y, x] = (float)Math.Atan2(gy[y, x], gx[y, x]); + } + } + return (mag, ang); + } + + /// + /// Shannon entropy check over 256-bin histogram. + /// + /// Input image. + /// Entropy threshold. + /// Entropy value and pass/fail status. + public static (double entropy, bool passed) EntropyCheck(byte[,] img, double threshold = 5.0) + { + int[] hist = new int[256]; + int H = img.GetLength(0); + int W = img.GetLength(1); + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + hist[img[y, x]]++; + + double total = H * W; + double entropy = 0; + for (int i = 0; i < 256; i++) + { + if (hist[i] > 0) + { + double p = hist[i] / total; + entropy -= p * Math.Log(p, 2); + } + } + return (entropy, entropy >= threshold); + } + + /// + /// Low-Frequency-Ratio in FFT spectrum. + /// Internally rescales to 128x128. + /// + /// Input grayscale image. + /// Ratio threshold. + /// Ratio value and pass/fail status. + public static (double ratio, bool passed) FftCheck(byte[,] imgGray, double threshold = 0.15) + { + byte[,] small = ResizeBilinear(imgGray, 128, 128); + Complex[,] f = Fft2D(small); + Complex[,] fshift = FftShift(f); + double[,] magnitude = new double[128, 128]; + for (int y = 0; y < 128; y++) + for (int x = 0; x < 128; x++) + magnitude[y, x] = fshift[y, x].Magnitude; + + int h = 128, w = 128; + double centerSum = 0; + double totalSum = 0; + for (int y = h / 4; y < 3 * h / 4; y++) + for (int x = w / 4; x < 3 * w / 4; x++) + centerSum += magnitude[y, x]; + for (int y = 0; y < h; y++) + for (int x = 0; x < w; x++) + totalSum += magnitude[y, x]; + + double ratio = centerSum / totalSum; + return (ratio, ratio < threshold); + } + + /// + /// Simple 2D FFT using System.Numerics + /// + private static Complex[,] Fft2D(byte[,] img) + { + int H = img.GetLength(0); + int W = img.GetLength(1); + Complex[,] result = new Complex[H, W]; + for (int y = 0; y < H; y++) + { + for (int x = 0; x < W; x++) + { + // Simplified FFT - for production, use a proper FFT library + // This is a placeholder that converts to complex + result[y, x] = new Complex(img[y, x], 0); + } + } + // Note: Full 2D FFT implementation would be needed for production + return result; + } + + private static Complex[,] FftShift(Complex[,] f) + { + int H = f.GetLength(0); + int W = f.GetLength(1); + Complex[,] shifted = new Complex[H, W]; + int h2 = H / 2, w2 = W / 2; + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + shifted[(y + h2) % H, (x + w2) % W] = f[y, x]; + return shifted; + } + + /// + /// Otsu Thresholding. + /// + /// Input image. + /// Binary image (0 or 255). 
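+        /// Example (illustrative sketch; gray is assumed to be a byte[,] grayscale image):
+        ///     byte[,] binary = ImageQuality.OtsuThreshold(gray);
+        ///     var (components, manyComponents) = ImageQuality.ComponentsCheck(binary, threshold: 10);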
+ public static byte[,] OtsuThreshold(byte[,] img) + { + int[] hist = new int[256]; + int H = img.GetLength(0); + int W = img.GetLength(1); + int total = H * W; + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + hist[img[y, x]]++; + + long sumTotal = 0; + for (int i = 0; i < 256; i++) + sumTotal += i * hist[i]; + + long sumB = 0; + long wB = 0; + double maxVar = 0; + int threshold = 0; + + for (int t = 0; t < 256; t++) + { + wB += hist[t]; + if (wB == 0) continue; + long wF = total - wB; + if (wF == 0) break; + + sumB += t * hist[t]; + double mB = sumB / (double)wB; + double mF = (sumTotal - sumB) / (double)wF; + double varBetween = wB * wF * (mB - mF) * (mB - mF); + + if (varBetween > maxVar) + { + maxVar = varBetween; + threshold = t; + } + } + + byte[,] binary = new byte[H, W]; + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + binary[y, x] = (byte)(img[y, x] > threshold ? 255 : 0); + return binary; + } + + /// + /// 8-connectivity Connected Components, Union-Find based two-pass approach. + /// + /// Input binary image (0 background, !=0 foreground). + /// Minimum component count threshold. + /// Component count and pass/fail status. + public static (int components, bool passed) ComponentsCheck(byte[,] binaryImg, int threshold = 10) + { + int H = binaryImg.GetLength(0); + int W = binaryImg.GetLength(1); + int[,] labels = new int[H, W]; + int maxLabels = H * W / 2 + 1; + int[] parent = new int[maxLabels]; + int[] rank = new int[maxLabels]; + for (int i = 0; i < maxLabels; i++) + parent[i] = i; + + int nextLabel = 1; + + int Find(int x) + { + while (parent[x] != x) + { + parent[x] = parent[parent[x]]; + x = parent[x]; + } + return x; + } + + void Union(int a, int b) + { + int ra = Find(a); + int rb = Find(b); + if (ra == rb) return; + if (rank[ra] < rank[rb]) + parent[ra] = rb; + else if (rank[ra] > rank[rb]) + parent[rb] = ra; + else + { + parent[rb] = ra; + rank[ra]++; + } + } + + // First pass + for (int y = 0; y < H; y++) + { + for (int x = 0; x < W; x++) + { + if (binaryImg[y, x] == 0) continue; + + List neighbors = new List(); + int[] dy = { -1, -1, -1, 0 }; + int[] dx = { -1, 0, 1, -1 }; + for (int i = 0; i < 4; i++) + { + int ny = y + dy[i]; + int nx = x + dx[i]; + if (ny >= 0 && ny < H && nx >= 0 && nx < W && labels[ny, nx] > 0) + neighbors.Add(labels[ny, nx]); + } + + if (neighbors.Count == 0) + { + labels[y, x] = nextLabel++; + } + else + { + int m = neighbors.Min(); + labels[y, x] = m; + foreach (int n in neighbors) + if (n != m) Union(m, n); + } + } + } + + // Second pass: Label flattening + Dictionary labelMap = new Dictionary(); + int current = 1; + for (int y = 0; y < H; y++) + { + for (int x = 0; x < W; x++) + { + if (labels[y, x] > 0) + { + int root = Find(labels[y, x]); + if (!labelMap.ContainsKey(root)) + labelMap[root] = current++; + labels[y, x] = labelMap[root]; + } + } + } + + int components = current - 1; + return (components, components >= threshold); + } + + /// + /// Non-maximum suppression + /// + public static float[,] NonMaxSuppression(float[,] mag, float[,] ang) + { + int H = mag.GetLength(0); + int W = mag.GetLength(1); + float[,] Z = new float[H, W]; + float[,] angDeg = new float[H, W]; + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + { + angDeg[y, x] = ang[y, x] * 180.0f / (float)Math.PI; + if (angDeg[y, x] < 0) angDeg[y, x] += 180; + } + + // Direction quantization + // 0°, 45°, 90°, 135° + + for (int y = 1; y < H - 1; y++) + { + for (int x = 1; x < W - 1; x++) + { + float angle = angDeg[y, x]; + float m0 = mag[y, x]; 
+ float m1 = 0, m2 = 0; + + // Helper function: compares with two neighbors in given direction + if ((angle >= 0 && angle < 22.5) || (angle >= 157.5 && angle <= 180)) + { + m1 = mag[y, x - 1]; + m2 = mag[y, x + 1]; + } + else if (angle >= 22.5 && angle < 67.5) + { + m1 = mag[y - 1, x + 1]; + m2 = mag[y + 1, x - 1]; + } + else if (angle >= 67.5 && angle < 112.5) + { + m1 = mag[y - 1, x]; + m2 = mag[y + 1, x]; + } + else if (angle >= 112.5 && angle < 157.5) + { + m1 = mag[y - 1, x - 1]; + m2 = mag[y + 1, x + 1]; + } + + if (m0 >= m1 && m0 >= m2) + Z[y, x] = m0; + } + } + return Z; + } + + /// + /// Hysteresis thresholding + /// + public static byte[,] HysteresisThresholding(float[,] img, float low, float high) + { + int H = img.GetLength(0); + int W = img.GetLength(1); + byte strongVal = 255; + byte weakVal = 50; + byte[,] result = new byte[H, W]; + + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + { + if (img[y, x] >= high) + result[y, x] = strongVal; + else if (img[y, x] >= low) + result[y, x] = weakVal; + } + + bool changed = true; + while (changed) + { + changed = false; + + // Neighborhood of a strong pixel: + // 8-neighborhood via shifts + // Weak pixels that border strong become strong + for (int y = 1; y < H - 1; y++) + { + for (int x = 1; x < W - 1; x++) + { + if (result[y, x] == weakVal) + { + bool hasStrong = false; + for (int dy = -1; dy <= 1; dy++) + { + for (int dx = -1; dx <= 1; dx++) + { + if (dx == 0 && dy == 0) continue; + if (result[y + dy, x + dx] == strongVal) + { + hasStrong = true; + break; + } + } + if (hasStrong) break; + } + if (hasStrong) + { + result[y, x] = strongVal; + changed = true; + } + } + } + } + } + + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + if (result[y, x] != strongVal) + result[y, x] = 0; + + return result; + } + + /// + /// Full Canny Edge Detector. + /// + /// Input image. + /// Low threshold. + /// High threshold. + /// Edge image. + public static byte[,] CannyNumPy(byte[,] img, float low = 50.0f, float high = 100.0f) + { + float[,] imgFloat = new float[img.GetLength(0), img.GetLength(1)]; + for (int y = 0; y < img.GetLength(0); y++) + for (int x = 0; x < img.GetLength(1); x++) + imgFloat[y, x] = img[y, x]; + + float[,] blur = GaussianBlur(imgFloat, 5, 1.0f); + byte[,] blurByte = new byte[blur.GetLength(0), blur.GetLength(1)]; + for (int y = 0; y < blur.GetLength(0); y++) + for (int x = 0; x < blur.GetLength(1); x++) + blurByte[y, x] = (byte)Math.Max(0, Math.Min(255, blur[y, x])); + + var (mag, ang) = SobelGradients(blurByte); + float[,] nms = NonMaxSuppression(mag, ang); + byte[,] edges = HysteresisThresholding(nms, low, high); + return edges; + } + + /// + /// Edge density check: mean(edges)/255.0. + /// + /// Input edge image (0/255). + /// Density threshold. + /// Density value and pass/fail status. + public static (double density, bool passed) EdgeDensityCheck(byte[,] edges, double threshold = 0.2) + { + int H = edges.GetLength(0); + int W = edges.GetLength(1); + long sum = 0; + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + sum += edges[y, x]; + double density = sum / (255.0 * H * W); + return (density, density >= threshold); + } + + /// + /// Runs all four checks and calculates weighted score. + /// + /// Input 2D byte array (grayscale). + /// Dictionary with analysis results. 
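+ /// Keys are "entropy", "fft_ratio", "components" and "edge_density" (each a (value, passed) tuple),
+ /// plus "score": components and edges contribute 2 points each, entropy and FFT 1 point each (maximum 6).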
+ public static Dictionary AnalyzeImage(byte[,] imgGray) + { + // 1) Entropy + var (entropyVal, entropyOk) = EntropyCheck(imgGray); + + // 2) FFT ratio + var (fftRatio, fftOk) = FftCheck(imgGray); + + // 3) Components + byte[,] binary = OtsuThreshold(imgGray); + var (componentsCnt, componentsOk) = ComponentsCheck(binary); + + // 4) Edges + byte[,] edges = CannyNumPy(imgGray); + var (edgeDensity, edgesOk) = EdgeDensityCheck(edges); + + // Weighted score + int score = 0; + if (componentsOk) score += 2; + if (edgesOk) score += 2; + if (entropyOk) score += 1; + if (fftOk) score += 1; + + return new Dictionary + { + ["entropy"] = (entropyVal, entropyOk), + ["fft_ratio"] = (fftRatio, fftOk), + ["components"] = (componentsCnt, componentsOk), + ["edge_density"] = (edgeDensity, edgesOk), + ["score"] = (score, false), + }; + } + } +} diff --git a/MuPDF.NET4LLM/helpers/MuPdfRag.cs b/MuPDF.NET4LLM/helpers/MuPdfRag.cs new file mode 100644 index 0000000..3994783 --- /dev/null +++ b/MuPDF.NET4LLM/helpers/MuPdfRag.cs @@ -0,0 +1,1619 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using MuPDF.NET; + +namespace MuPDF.NET4LLM.Helpers +{ + /// + /// Header identification based on font sizes + /// + public class IdentifyHeaders + { + private Dictionary _headerId = new Dictionary(); + private float _bodyLimit; + + public IdentifyHeaders( + object doc, // Document or string + List pages = null, + float bodyLimit = 12.0f, // Force this to be body text + int maxLevels = 6) // Accept this many header levels + { + if (maxLevels < 1 || maxLevels > 6) + throw new ArgumentException("max_levels must be between 1 and 6"); + + Document mydoc = doc as Document; + if (mydoc == null) + { + mydoc = new Document(doc.ToString()); + } + + // Remove StructTreeRoot to avoid possible performance degradation + // We will not use the structure tree anyway. + if (pages == null) // Use all pages if omitted + pages = Enumerable.Range(0, mydoc.PageCount).ToList(); + + Dictionary fontSizes = new Dictionary(); + + foreach (int pno in pages) + { + Page page = mydoc.LoadPage(pno); + // Use TEXTFLAGS_TEXT for proper text extraction (matches Python TEXTFLAGS_TEXT) + int textFlags = (int)TextFlagsExtension.TEXTFLAGS_TEXT; + TextPage textPage = page.GetTextPage(flags: textFlags); + PageInfo pageInfo = textPage.ExtractDict(null, false); + + // Look at all non-empty horizontal spans + foreach (var block in pageInfo.Blocks ?? new List()) + { + if (block.Type != 0) continue; + if (block.Lines == null) continue; + + foreach (var line in block.Lines) + { + if (line.Spans == null) continue; + foreach (var span in line.Spans) + { + string text = span.Text ?? ""; + if (Utils.IsWhite(text)) continue; + + int fontSz = (int)Math.Round(span.Size); // Compute rounded fontsize + if (!fontSizes.ContainsKey(fontSz)) + fontSizes[fontSz] = 0; + fontSizes[fontSz] += text.Trim().Length; // Add character count + } + } + } + + textPage.Dispose(); + page.Dispose(); + } + + if (mydoc != doc as Document) + // If opened here, close it now + mydoc.Close(); + + // Maps a fontsize to a string of multiple # header tag characters + // If not provided, choose the most frequent font size as body text. + // If no text at all on all pages, just use body_limit. 
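+ // the resulting body limit are treated as body text.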
+ // In any case all fonts not exceeding + var sorted = fontSizes.OrderBy(kvp => (kvp.Value, kvp.Key)).ToList(); + if (sorted.Count > 0) + { + // Most frequent font size + _bodyLimit = Math.Max(bodyLimit, sorted[sorted.Count - 1].Key); + } + else + { + _bodyLimit = bodyLimit; + } + + // Identify up to 6 font sizes as header candidates + var sizes = fontSizes.Keys + .Where(f => f > _bodyLimit) + .OrderByDescending(f => f) + .Take(maxLevels) + .ToList(); + + // Make the header tag dictionary + for (int i = 0; i < sizes.Count; i++) + { + _headerId[sizes[i]] = new string('#', i + 1) + " "; + } + + if (_headerId.Count > 0) + _bodyLimit = _headerId.Keys.Min() - 1; + } + + /// + /// Return appropriate markdown header prefix. + /// Given a text span from a "dict"/"rawdict" extraction, determine the + /// markdown header prefix string of 0 to n concatenated '#' characters. + /// + public string GetHeaderId(ExtendedSpan span, Page page = null) + { + int fontsize = (int)Math.Round(span.Size); // Compute fontsize + if (fontsize <= _bodyLimit) + return ""; + string hdrId = _headerId.ContainsKey(fontsize) ? _headerId[fontsize] : ""; + return hdrId; + } + } + + /// + /// Header identification based on Table of Contents + /// + public class TocHeaders + { + private List _toc; + + /// + /// Read and store the TOC of the document. + /// + public TocHeaders(object doc) + { + Document mydoc = doc as Document; + if (mydoc == null) + { + mydoc = new Document(doc.ToString()); + } + + _toc = mydoc.GetToc(); + + if (mydoc != doc as Document) + // If opened here, close it now + mydoc.Close(); + } + + /// + /// Return appropriate markdown header prefix. + /// Given a text span from a "dict"/"rawdict" extraction, determine the + /// markdown header prefix string of 0 to n concatenated '#' characters. + /// + public string GetHeaderId(ExtendedSpan span, Page page = null) + { + if (page == null) + return ""; + // Check if this page has TOC entries with an actual title + var myToc = _toc.Where(t => !string.IsNullOrEmpty(t.Title) && t.Page == page.Number + 1).ToList(); + if (myToc.Count == 0) // No TOC items present on this page + return ""; + // Check if the span matches a TOC entry. This must be done in the + // most forgiving way: exact matches are rare animals. + string text = (span.Text ?? 
"").Trim(); // Remove leading and trailing whitespace + foreach (var t in myToc) + { + string title = t.Title.Trim(); // Title of TOC entry + int lvl = t.Level; // Level of TOC entry + if (text.StartsWith(title) || title.StartsWith(text)) + { + // Found a match: return the header tag + return new string('#', lvl) + " "; + } + } + return ""; + } + } + + /// + /// Parameters class to store page-specific information (matches Python dataclass) + /// + public class Parameters + { + public Page Page { get; set; } + public string Filename { get; set; } + public string MdString { get; set; } = ""; + public List Images { get; set; } = new List(); + public List Tables { get; set; } = new List(); + public List Graphics { get; set; } = new List(); + public List Words { get; set; } = new List(); + public List LineRects { get; set; } = new List(); + public bool AcceptInvisible { get; set; } + public float[] BgColor { get; set; } + public Rect Clip { get; set; } + public List Links { get; set; } = new List(); + public List AnnotRects { get; set; } = new List(); + public TextPage TextPage { get; set; } + public List ImgRects { get; set; } = new List(); + public List TabRects0 { get; set; } = new List(); + public Dictionary TabRects { get; set; } = new Dictionary(); + public List
<Table> Tabs { get; set; } = new List<Table>
(); + public List WrittenTables { get; set; } = new List(); + public List WrittenImages { get; set; } = new List(); + public List ActualPaths { get; set; } = new List(); + public List VgClusters0 { get; set; } = new List(); + public Dictionary VgClusters { get; set; } = new Dictionary(); + } + + /// + /// Main markdown conversion utilities. + /// Ported and adapted from the Python module helpers/pymupdf_rag.py in pymupdf4llm. + /// + public static class MuPdfRag + { + private const string GRAPHICS_TEXT = "\n![]({0})\n"; + + /// + /// Convert a document to Markdown, closely following the behavior of + /// pymupdf4llm.helpers.pymupdf_rag.ToMarkdown. + /// + /// Input to convert. + /// + /// Page numbers (0‑based) to process. When null, all pages are processed. + /// + /// + /// Optional header resolver used to create Markdown headings. This can be + /// an instance, a instance, + /// or null to auto‑detect headers. + /// + /// + /// When true, images are written to disk and referenced by relative path. + /// + /// + /// When true, images are embedded as data: URLs in the Markdown. + /// Cannot be combined with . + /// + /// + /// When true, image regions are ignored entirely (no image and no OCR text). + /// + /// + /// When true, vector graphics are ignored (no layout‑based table / column hints). + /// + /// + /// When true, tries to detect a uniform page background to filter + /// out large background rectangles from graphics analysis. + /// + /// + /// Target directory for written images when is true. + /// + /// Image file format, e.g. "png" or "jpg". + /// + /// Minimum relative size (\(0 \leq v < 1\)) of images with respect to the page + /// before they are considered for output. + /// + /// + /// Logical filename used in image names and metadata; defaults to . + /// + /// + /// When true, attempts to also extract text from image regions (e.g. diagrams) + /// in addition to placing images. + /// + /// + /// When true, returns a JSON string describing per‑page “chunks” instead of raw Markdown. + /// + /// + /// When true, appends an explicit --- end of page=... marker after each page. + /// + /// + /// Optional margins in points. One value applies to all sides, two values to + /// top/bottom and left/right, and four values to left, top, right, bottom. + /// + /// + /// Resolution used for image extraction where a is rendered. + /// + /// + /// Page width used for reflowable documents when is true. + /// + /// + /// Optional page height for reflowable documents. If null, a single tall page + /// covering the whole document is created. + /// + /// + /// Table detection strategy passed to Page.GetTables, e.g. "lines_strict" + /// to mimic the Python default. + /// + /// + /// Optional upper bound on the number of path objects before graphics are ignored + /// for layout analysis (similar to graphics_limit in Python). + /// + /// + /// Minimum font size considered as “normal” text when computing some heuristics. + /// + /// + /// When true, code blocks (mono‑spaced text) are not emitted as fenced code blocks. + /// + /// + /// When true, the return value is a JSON description of page “chunks” with + /// word positions, matching the Python extract_words mode. + /// + /// + /// When true, prints a simple progress bar while processing pages. + /// + /// + /// When true, uses glyph IDs for unknown Unicode characters, similar to + /// FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE in the C API. + /// + /// + /// When true, treats fully transparent text as visible (affects OCR heuristics). 
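+ /// A minimal call sketch (illustrative only; "input.pdf" and "output.md" are placeholder paths):
+ /// Document doc = new Document("input.pdf");
+ /// string md = MuPdfRag.ToMarkdown(doc, writeImages: false, pageChunks: false);
+ /// File.WriteAllText("output.md", md, Encoding.UTF8);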
+ /// + /// + /// Markdown text for the selected pages, or a JSON string describing page chunks + /// when / is enabled. + /// + public static string ToMarkdown( + Document doc, + List pages = null, + object hdrInfo = null, // Can be IdentifyHeaders, TocHeaders, or null + bool writeImages = false, + bool embedImages = false, + bool ignoreImages = false, + bool ignoreGraphics = false, + bool detectBgColor = true, + string imagePath = "", + string imageFormat = "png", + float imageSizeLimit = 0.05f, + string filename = null, + bool forceText = true, + bool pageChunks = false, + bool pageSeparators = false, + List margins = null, + int dpi = 150, + float pageWidth = 612, + float? pageHeight = null, + string tableStrategy = "lines_strict", + int? graphicsLimit = null, + float fontsizeLimit = 3.0f, + bool ignoreCode = false, + bool extractWords = false, + bool showProgress = false, + bool useGlyphs = false, + bool ignoreAlpha = false) + { + if (!writeImages && !embedImages && !forceText) + throw new ArgumentException("Images and text on images cannot both be suppressed."); + if (embedImages) + { + writeImages = false; + imagePath = string.Empty; + } + if (imageSizeLimit < 0 || imageSizeLimit >= 1) + { + throw new ArgumentOutOfRangeException( + nameof(imageSizeLimit), + "'imageSizeLimit' must be non-negative and less than 1."); + } + + int DPI = dpi; + bool IGNORE_CODE = ignoreCode; + string IMG_EXTENSION = imageFormat; + bool EXTRACT_WORDS = extractWords; + if (EXTRACT_WORDS) + { + pageChunks = true; + ignoreCode = true; + } + string IMG_PATH = imagePath; + if (!string.IsNullOrEmpty(IMG_PATH) && writeImages && !Directory.Exists(IMG_PATH)) + Directory.CreateDirectory(IMG_PATH); + + string FILENAME = filename ?? doc.Name; + // Assign configuration + int? GRAPHICS_LIMIT = graphicsLimit; + double FONTSIZE_LIMIT = fontsizeLimit; + bool IGNORE_IMAGES = ignoreImages; + bool IGNORE_GRAPHICS = ignoreGraphics; + bool DETECT_BG_COLOR = detectBgColor; + + if (filename == null) + filename = doc.Name; + + // Handle form PDFs and documents with annotations + if (doc.IsFormPDF > 0 || (doc.IsPDF && doc.HasAnnots())) + { + doc.Bake(); + } + + // For reflowable documents, allow making 1 page for the whole document + if (doc.IsReflowable) + { + if (pageHeight.HasValue) + { + // Accept user page dimensions + doc.SetLayout(width: pageWidth, height: pageHeight.Value); + } + else + { + // No page height limit given: make 1 page for whole document + doc.SetLayout(width: pageWidth, height: 792); + int pageCount = doc.PageCount; + float height = 792 * pageCount; // Height that covers full document + doc.SetLayout(width: pageWidth, height: height); + } + } + + if (pages == null) // Use all pages if no selection given + pages = Enumerable.Range(0, doc.PageCount).ToList(); + + // Process margins: convert to 4-element list + if (margins == null) + margins = new List { 0, 0, 0, 0 }; + else if (margins.Count == 1) + margins = new List { margins[0], margins[0], margins[0], margins[0] }; + else if (margins.Count == 2) + margins = new List { 0, margins[0], 0, margins[1] }; + else if (margins.Count != 4) + throw new ArgumentException("margins must be one, two or four floats"); + + // If "hdr_info" is not an object with a method "get_header_id", scan the + // document and use font sizes as header level indicators. 
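+ // For example, passing "new TocHeaders(doc)" as hdrInfo derives heading levels from the
+ // table of contents, while null (the default) falls back to font-size based detection.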
+ Func getHeaderId; + + if (hdrInfo is IdentifyHeaders idHdr) + getHeaderId = idHdr.GetHeaderId; + else if (hdrInfo is TocHeaders tocHdr) + getHeaderId = tocHdr.GetHeaderId; + else if (hdrInfo == null) + { + var idHdr2 = new IdentifyHeaders(doc, pages); + getHeaderId = idHdr2.GetHeaderId; + } + else + getHeaderId = (s, p) => ""; + + // Initialize output based on page_chunks mode + object documentOutput; + if (!pageChunks) + { + documentOutput = new StringBuilder(); + } + else + { + documentOutput = new List>(); + } + + // Read the Table of Contents + List toc = doc.GetToc(); + + // Text extraction flags: omit clipped text, collect styles + int textFlags = (int)TextFlags.TEXT_MEDIABOX_CLIP | + (int)mupdf.mupdf.FZ_STEXT_COLLECT_STYLES; + + // Optionally replace REPLACEMENT_CHARACTER by glyph number + if (useGlyphs) + { + textFlags |= (int)mupdf.mupdf.FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE; + } + + // Note: Table FLAGS would be set here if we had access to pymupdf.table.FLAGS + // In C#, this would need to be handled differently if table extraction uses flags + + var progressBar = showProgress && pages.Count > 5 + ? ProgressBar.Create(pages.Cast().ToList()) + : null; + + try + { + if (showProgress) + { + Console.WriteLine($"Processing {FILENAME}..."); + } + + foreach (int pno in pages) + { + if (progressBar != null && !progressBar.MoveNext()) + break; + + Parameters pageParms = GetPageOutput( + doc, pno, margins, getHeaderId, writeImages, embedImages, ignoreImages, + imagePath, imageFormat, filename, forceText, dpi, ignoreCode, + ignoreGraphics, tableStrategy, detectBgColor, graphicsLimit, + ignoreAlpha, extractWords, pageSeparators, imageSizeLimit, textFlags); + + if (!pageChunks) + { + ((StringBuilder)documentOutput).Append(pageParms.MdString); + } + else + { + // Build subset of TOC for this page + var pageTocs = toc.Where(t => t.Page == pno + 1).ToList(); + + var metadata = GetMetadata(doc, pno, FILENAME); + + var pageChunk = new Dictionary + { + ["metadata"] = metadata, + ["toc_items"] = pageTocs, + ["tables"] = pageParms.Tables, + ["images"] = pageParms.Images, + ["graphics"] = pageParms.Graphics, + ["text"] = pageParms.MdString, + ["words"] = pageParms.Words + }; + + ((List>)documentOutput).Add(pageChunk); + } + } + } + finally + { + progressBar?.Dispose(); + } + + if (!pageChunks) + { + return ((StringBuilder)documentOutput).ToString(); + } + else + { + // For page_chunks mode, we need to return a structured format + // Since System.Text.Json may not be available in all .NET versions, + // we'll use Newtonsoft.Json if available, or return a simple string representation + try + { + return Newtonsoft.Json.JsonConvert.SerializeObject(documentOutput, Newtonsoft.Json.Formatting.Indented); + } + catch + { + // Fallback: return a simple string representation + var sb = new StringBuilder(); + foreach (var chunk in (List>)documentOutput) + { + sb.AppendLine("--- Page Chunk ---"); + foreach (var kvp in chunk) + { + sb.AppendLine($"{kvp.Key}: {kvp.Value}"); + } + sb.AppendLine(); + } + return sb.ToString(); + } + } + } + + /// + /// Get maximum header ID from spans (matches Python max_header_id) + /// + private static string MaxHeaderId( + List spans, + Page page, + Func getHeaderId) + { + var hdrIds = spans + .Select(s => getHeaderId(s, page)) + .Where(h => !string.IsNullOrEmpty(h)) + .Select(h => h.Trim().Length) + .Where(l => l > 0) + .Distinct() + .OrderBy(l => l) + .ToList(); + + if (hdrIds.Count == 0) + return ""; + + // Return header tag with one less '#' than the minimum found + return new 
string('#', hdrIds[0] - 1) + " "; + } + + /// + /// Accept a span and return a markdown link string. + /// A link should overlap at least 70% of the span. + /// + private static string ResolveLinks(List links, ExtendedSpan span) + { + if (links == null || links.Count == 0 || span == null || span.Bbox == null) + return null; + + Rect spanBbox = span.Bbox; // Span bbox + + foreach (var link in links) + { + // Only process URI links + if (link.Kind != LinkType.LINK_URI || string.IsNullOrEmpty(link.Uri)) + continue; + + if (link.From == null) + continue; + + // The hot area of the link + // Middle point of hot area + float middleX = (link.From.TopLeft.X + link.From.BottomRight.X) / 2; + float middleY = (link.From.TopLeft.Y + link.From.BottomRight.Y) / 2; + + // Does not touch the bbox + if (!(middleX >= spanBbox.X0 && middleX <= spanBbox.X1 && + middleY >= spanBbox.Y0 && middleY <= spanBbox.Y1)) + continue; + + string text = (span.Text ?? "").Trim(); + if (!string.IsNullOrEmpty(text)) + { + return $"[{text}]({link.Uri})"; + } + } + + return null; + } + + /// + /// Optionally render the rect part of a page. + /// We will ignore images that are empty or that have an edge smaller + /// than x% of the corresponding page edge. + /// + private static string SaveImage( + Page page, + Rect rect, + int imageIndex, + bool writeImages, + bool embedImages, + string imagePath, + string imageFormat, + string filename, + int dpi, + float imageSizeLimit) + { + // Check if image is too small + if (rect.Width < page.Rect.Width * imageSizeLimit || + rect.Height < page.Rect.Height * imageSizeLimit) + { + return ""; + } + + if (!writeImages && !embedImages) + return ""; + + Pixmap pix = page.GetPixmap(clip: rect, dpi: dpi); + try + { + if (pix.H <= 0 || pix.W <= 0) + return ""; + + if (writeImages) + { + // Ensure image path exists + if (!string.IsNullOrEmpty(imagePath) && !Directory.Exists(imagePath)) + { + Directory.CreateDirectory(imagePath); + } + + string safeFilename = Path.GetFileName(filename ?? "document").Replace(" ", "-"); + string imageFilename = string.IsNullOrEmpty(imagePath) + ? $"{safeFilename}-{page.Number}-{imageIndex}.{imageFormat}" + : Path.Combine(imagePath, $"{safeFilename}-{page.Number}-{imageIndex}.{imageFormat}"); + pix.Save(imageFilename); + return imageFilename.Replace("\\", "/"); + } + else if (embedImages) + { + // Make a base64 encoded string of the image + byte[] imageBytes = pix.ToBytes(imageFormat); + string base64 = Convert.ToBase64String(imageBytes); + return $"data:image/{imageFormat};base64,{base64}"; + } + } + finally + { + pix.Dispose(); + } + + return ""; + } + + /// + /// Check if page exclusively contains OCR text. + /// For this to be true, all text must be written as "ignore-text". + /// + private static bool PageIsOcr(Page page) + { + try + { + var bboxLog = page.GetBboxlog(); + var textTypes = new HashSet(bboxLog + .Where(b => b.Type != null && b.Type.Contains("text")) + .Select(b => b.Type) + .Distinct()); + + return textTypes.Count == 1 && textTypes.Contains("ignore-text"); + } + catch + { + return false; + } + } + + /// + /// Get metadata for a page (matches Python get_metadata) + /// + private static Dictionary GetMetadata(Document doc, int pno, string filename) + { + var meta = new Dictionary(); + if (doc.MetaData != null) + { + foreach (var kvp in doc.MetaData) + { + meta[kvp.Key] = kvp.Value; + } + } + meta["file_path"] = filename; + meta["page_count"] = doc.PageCount; + meta["page"] = pno + 1; + return meta; + } + + /// + /// Reorder words in lines. 
+ /// The argument list must be presorted by bottom, then left coordinates. + /// Words with similar top / bottom coordinates are assumed to belong to + /// the same line and will be sorted left to right within that line. + /// + private static List SortWords(List words) + { + if (words == null || words.Count == 0) + return new List(); + + List nwords = new List(); + List line = new List { words[0] }; + Rect lrect = new Rect(words[0].X0, words[0].Y0, words[0].X1, words[0].Y1); + + for (int i = 1; i < words.Count; i++) + { + var word = words[i]; + var wrect = new Rect(word.X0, word.Y0, word.X1, word.Y1); + if (Math.Abs(wrect.Y0 - lrect.Y0) <= 3 || Math.Abs(wrect.Y1 - lrect.Y1) <= 3) + { + line.Add(word); + lrect = Utils.JoinRects(new List { lrect, wrect }); + } + else + { + line = line.OrderBy(w => w.X0).ToList(); + nwords.AddRange(line); + line = new List { word }; + lrect = new Rect(word.X0, word.Y0, word.X1, word.Y1); + } + } + + line = line.OrderBy(w => w.X0).ToList(); + nwords.AddRange(line); + return nwords; + } + + /// + /// Output tables above given text rectangle (matches Python output_tables) + /// + private static string OutputTables(Parameters parms, Rect textRect, bool extractWords) + { + StringBuilder thisMd = new StringBuilder(); // Markdown string for table(s) content + + if (textRect != null) // Select tables above the text block + { + var tabCandidates = parms.TabRects + .Where(kvp => kvp.Value.Y1 <= textRect.Y0 && !parms.WrittenTables.Contains(kvp.Key) && + (textRect.X0 <= kvp.Value.X0 && kvp.Value.X0 < textRect.X1 || + textRect.X0 < kvp.Value.X1 && kvp.Value.X1 <= textRect.X1 || + kvp.Value.X0 <= textRect.X0 && textRect.X1 <= kvp.Value.X1)) + .OrderBy(kvp => kvp.Value.Y1) + .ThenBy(kvp => kvp.Value.X0) + .ToList(); + + foreach (var kvp in tabCandidates) + { + int i = kvp.Key; + thisMd.Append("\n" + parms.Tabs[i].ToMarkdown(clean: false) + "\n"); + + if (extractWords) + { + // For "words" extraction, add table cells as line rects + var cells = new List(); + if (parms.Tabs[i].header != null && parms.Tabs[i].header.cells != null) + { + foreach (var c in parms.Tabs[i].header.cells) + { + if (c != null) + cells.Add(c); + } + } + if (parms.Tabs[i].cells != null) + { + foreach (var c in parms.Tabs[i].cells) + { + if (c != null) + cells.Add(c); + } + } + cells = cells.Distinct() + .OrderBy(c => c.Y1) + .ThenBy(c => c.X0) + .ToList(); + parms.LineRects.AddRange(cells); + } + parms.WrittenTables.Add(i); // Do not touch this table twice + } + } + else // Output all remaining tables + { + foreach (var kvp in parms.TabRects) + { + int i = kvp.Key; + if (parms.WrittenTables.Contains(i)) + continue; + + thisMd.Append("\n" + parms.Tabs[i].ToMarkdown(clean: false) + "\n"); + + if (extractWords) + { + // For "words" extraction, add table cells as line rects + var cells = new List(); + if (parms.Tabs[i].header != null && parms.Tabs[i].header.cells != null) + { + foreach (var c in parms.Tabs[i].header.cells) + { + if (c != null) + cells.Add(c); + } + } + if (parms.Tabs[i].cells != null) + { + foreach (var c in parms.Tabs[i].cells) + { + if (c != null) + cells.Add(c); + } + } + cells = cells.Distinct() + .OrderBy(c => c.Y1) + .ThenBy(c => c.X0) + .ToList(); + parms.LineRects.AddRange(cells); + } + parms.WrittenTables.Add(i); // Do not touch this table twice + } + } + + return thisMd.ToString(); + } + + /// + /// Output images and graphics above text rectangle (matches Python output_images) + /// + private static string OutputImages(Parameters parms, Rect textRect, bool forceText, + bool 
writeImages, bool embedImages, string imagePath, string imageFormat, + string filename, int dpi, float imageSizeLimit, Func getHeaderId) + { + if (parms.ImgRects == null || parms.ImgRects.Count == 0) + return ""; + + StringBuilder thisMd = new StringBuilder(); // Markdown string + + if (textRect != null) // Select images above the text block + { + for (int i = 0; i < parms.ImgRects.Count; i++) + { + if (parms.WrittenImages.Contains(i)) + continue; + + Rect imgRect = parms.ImgRects[i]; + if (imgRect.Y0 > textRect.Y0) + continue; + if (imgRect.X0 >= textRect.X1 || imgRect.X1 <= textRect.X0) + continue; + + string pathname = SaveImage(parms.Page, imgRect, i, writeImages, embedImages, + imagePath, imageFormat, filename, dpi, imageSizeLimit); + parms.WrittenImages.Add(i); // Do not touch this image twice + + if (!string.IsNullOrEmpty(pathname)) + { + thisMd.AppendFormat(GRAPHICS_TEXT, pathname); + } + + if (forceText) + { + // Recursive invocation + string imgTxt = WriteText(parms, imgRect, getHeaderId, forceText: true, + ignoreCode: false, extractWords: false); + if (!Utils.IsWhite(imgTxt)) // Was there text at all? + { + thisMd.Append(imgTxt); + } + } + } + } + else // Output all remaining images + { + for (int i = 0; i < parms.ImgRects.Count; i++) + { + if (parms.WrittenImages.Contains(i)) + continue; + + string pathname = SaveImage(parms.Page, parms.ImgRects[i], i, writeImages, embedImages, + imagePath, imageFormat, filename, dpi, imageSizeLimit); + parms.WrittenImages.Add(i); // Do not touch this image twice + + if (!string.IsNullOrEmpty(pathname)) + { + thisMd.AppendFormat(GRAPHICS_TEXT, pathname); + } + + if (forceText) + { + string imgTxt = WriteText(parms, parms.ImgRects[i], getHeaderId, forceText: true, + ignoreCode: false, extractWords: false); + if (!Utils.IsWhite(imgTxt)) + { + thisMd.Append(imgTxt); + } + } + } + } + + return thisMd.ToString(); + } + + /// + /// Output the text found inside the given clip. + /// This is an alternative for plain text in that it outputs + /// text enriched with markdown styling. + /// The logic is capable of recognizing headers, body text, code blocks, + /// inline code, bold, italic and bold-italic styling. + /// There is also some effort for list supported (ordered / unordered) in + /// that typical characters are replaced by respective markdown characters. + /// 'tables'/'images' indicate whether this execution should output these + /// objects. 
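+ /// Note: this method is also called recursively for image regions when forced text
+ /// extraction is active, so text printed on top of pictures is still captured.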
+ /// + private static string WriteText(Parameters parms, Rect clip, + Func getHeaderId, bool forceText, bool ignoreCode, bool extractWords) + { + if (clip == null) + clip = parms.Clip; + + StringBuilder outString = new StringBuilder(); + + // This is a list of tuples (linerect, spanlist) + var nlines = GetTextLines.GetRawLines(parms.TextPage, null, clip, tolerance: 3, + ignoreInvisible: !parms.AcceptInvisible); + + // Filter out lines that intersect with tables + nlines = nlines + .Where(l => Utils.OutsideAllBboxes(l.Rect, parms.TabRects.Values)) + .ToList(); + + parms.LineRects.AddRange(nlines.Select(l => l.Rect)); // Store line rectangles + + Rect prevLrect = null; // Previous line rectangle + int prevBno = -1; // Previous block number of line + bool code = false; // Mode indicator: outputting code + string prevHdrString = null; + + foreach (var line in nlines) + { + Rect lrect = line.Rect; + var spans = line.Spans; + + // Skip if line intersects with images + if (!Utils.OutsideAllBboxes(lrect, parms.ImgRects)) + continue; + + // Pick up tables ABOVE this text block + var tabCandidates = parms.TabRects + .Where(kvp => kvp.Value.Y1 <= lrect.Y0 && !parms.WrittenTables.Contains(kvp.Key) && + (lrect.X0 <= kvp.Value.X0 && kvp.Value.X0 < lrect.X1 || + lrect.X0 < kvp.Value.X1 && kvp.Value.X1 <= lrect.X1 || + kvp.Value.X0 <= lrect.X0 && lrect.X1 <= kvp.Value.X1)) + .ToList(); + + foreach (var kvp in tabCandidates) + { + int i = kvp.Key; + outString.Append("\n" + parms.Tabs[i].ToMarkdown(clean: false) + "\n"); + + if (extractWords) + { + var cells = new List(); + if (parms.Tabs[i].header != null && parms.Tabs[i].header.cells != null) + { + foreach (var c in parms.Tabs[i].header.cells) + { + if (c != null) + cells.Add(c); + } + } + if (parms.Tabs[i].cells != null) + { + foreach (var c in parms.Tabs[i].cells) + { + if (c != null) + cells.Add(c); + } + } + parms.LineRects.AddRange(cells.OrderBy(c => c.Y1).ThenBy(c => c.X0)); + } + parms.WrittenTables.Add(i); + prevHdrString = null; + } + + // Pick up images/graphics ABOVE this text block + for (int i = 0; i < parms.ImgRects.Count; i++) + { + if (parms.WrittenImages.Contains(i)) + continue; + + Rect r = parms.ImgRects[i]; + if (Math.Max(r.Y0, lrect.Y0) < Math.Min(r.Y1, lrect.Y1) && + (lrect.X0 <= r.X0 && r.X0 < lrect.X1 || + lrect.X0 < r.X1 && r.X1 <= lrect.X1 || + r.X0 <= lrect.X0 && lrect.X1 <= r.X1)) + { + string pathname = SaveImage(parms.Page, r, i, false, false, "", "", "", 150, 0.05f); + if (!string.IsNullOrEmpty(pathname)) + { + outString.AppendFormat(GRAPHICS_TEXT, pathname); + } + + if (forceText) + { + string imgTxt = WriteText(parms, r, getHeaderId, forceText: true, + ignoreCode: false, extractWords: false); + if (!Utils.IsWhite(imgTxt)) + { + outString.Append(imgTxt); + } + } + parms.WrittenImages.Add(i); + prevHdrString = null; + } + } + + parms.LineRects.Add(lrect); + + // If line rect is far away from previous one, add line break + if (parms.LineRects.Count > 1) + { + var prevRect = parms.LineRects[parms.LineRects.Count - 2]; + if (lrect.Y1 - prevRect.Y1 > lrect.Height * 1.5f) + { + outString.Append("\n"); + } + } + + // Make text string for the full line + string text = string.Join(" ", spans.Select(s => s.Text ?? 
"").Where(t => !string.IsNullOrWhiteSpace(t))).Trim(); + + // Check formatting flags + bool allStrikeout = spans.All(s => ((int)s.CharFlags & 1) != 0); + bool allItalic = spans.All(s => ((int)s.Flags & 2) != 0); + bool allBold = spans.All(s => (((int)s.Flags & 16) != 0) || (((int)s.CharFlags & 8) != 0)); + bool allMono = spans.All(s => ((int)s.Flags & 8) != 0); + + // Get header string + string hdrString = MaxHeaderId(spans, parms.Page, getHeaderId); + + if (!string.IsNullOrEmpty(hdrString)) + { + // Header line + if (allMono) + text = "`" + text + "`"; + if (allItalic) + text = "_" + text + "_"; + if (allBold) + text = "**" + text + "**"; + if (allStrikeout) + text = "~~" + text + "~~"; + + if (hdrString != prevHdrString) + { + outString.Append(hdrString + text + "\n"); + } + else + { + // Header text broken across multiple lines + while (outString.Length > 0 && outString[outString.Length - 1] == '\n') + outString.Length--; + outString.Append(" " + text + "\n"); + } + prevHdrString = hdrString; + continue; + } + + prevHdrString = hdrString; + + // Start or extend code block + if (allMono && !ignoreCode) + { + if (!code) + { + outString.Append("```\n"); + code = true; + } + float delta = (lrect.X0 - clip.X0) / (spans[0].Size * 0.5f); + string indent = new string(' ', Math.Max(0, (int)delta)); + outString.Append(indent + text + "\n"); + continue; + } + + if (code && !allMono) + { + outString.Append("```\n"); + code = false; + } + + ExtendedSpan span0 = spans[0]; + int bno = span0.Block; + if (bno != prevBno) + { + outString.Append("\n"); + prevBno = bno; + } + + // Check if we need another line break + if ((prevLrect != null && lrect.Y1 - prevLrect.Y1 > lrect.Height * 1.5f) || + (span0.Text != null && (span0.Text.StartsWith("[") || Utils.StartswithBullet(span0.Text))) || + ((int)span0.Flags & 1) != 0) // Superscript + { + outString.Append("\n"); + } + prevLrect = lrect; + + // Switch off code mode if not all mono + if (code) + { + outString.Append("```\n"); + code = false; + } + + // Process each span + foreach (var s in spans) + { + bool mono = ((int)s.Flags & 8) != 0; + bool bold = ((int)s.Flags & 16) != 0 || ((int)s.CharFlags & 8) != 0; + bool italic = ((int)s.Flags & 2) != 0; + bool strikeout = ((int)s.CharFlags & 1) != 0; + + string prefix = ""; + string suffix = ""; + + if (mono) + { + prefix = "`" + prefix; + suffix += "`"; + } + if (bold) + { + prefix = "**" + prefix; + suffix += "**"; + } + if (italic) + { + prefix = "_" + prefix; + suffix += "_"; + } + if (strikeout) + { + prefix = "~~" + prefix; + suffix += "~~"; + } + + // Convert intersecting link to markdown syntax + string ltext = ResolveLinks(parms.Links, s); + if (ltext != null) + { + text = hdrString + prefix + ltext + suffix + " "; + } + else + { + text = hdrString + prefix + (s.Text ?? "").Trim() + suffix + " "; + } + + if (Utils.StartswithBullet(text)) + { + text = "- " + text.Substring(1); + text = text.Replace(" ", " "); + float dist = span0.Bbox.X0 - clip.X0; + float cwidth = (span0.Bbox.X1 - span0.Bbox.X0) / Math.Max(1, (span0.Text ?? 
"").Length); + if (cwidth == 0.0f) + cwidth = span0.Size * 0.5f; + int indentCount = (int)Math.Round(dist / cwidth); + text = new string(' ', Math.Max(0, indentCount)) + text; + } + + outString.Append(text); + } + + if (!code) + outString.Append("\n"); + } + + outString.Append("\n"); + if (code) + { + outString.Append("```\n"); + code = false; + } + outString.Append("\n\n"); + + string result = outString.ToString(); + result = result.Replace(" \n", "\n").Replace(" ", " "); + while (result.Contains("\n\n\n")) + result = result.Replace("\n\n\n", "\n\n"); + + return result; + } + + private static Parameters GetPageOutput( + Document doc, + int pno, + List margins, + Func getHeaderId, + bool writeImages, + bool embedImages, + bool ignoreImages, + string imagePath, + string imageFormat, + string filename, + bool forceText, + int dpi, + bool ignoreCode, + bool ignoreGraphics, + string tableStrategy, + bool detectBgColor, + int? graphicsLimit, + bool ignoreAlpha, + bool extractWords, + bool pageSeparators, + float imageSizeLimit, + int textFlags) + { + Page page = doc[pno]; + // Remove rotation to ensure we work on rotation=0 + page.RemoveRotation(); + + // Create Parameters object to store page information + Parameters parms = new Parameters + { + Page = page, + Filename = filename, + MdString = "", + Images = new List(), + Tables = new List(), + Graphics = new List(), + Words = new List(), + LineRects = new List(), + AcceptInvisible = PageIsOcr(page) || ignoreAlpha + }; + + // Determine background color + if (detectBgColor) + { + parms.BgColor = Utils.GetBgColor(page); + } + + // Process margins + float left = 0, top = 0, right = 0, bottom = 0; + if (margins != null && margins.Count > 0) + { + if (margins.Count == 1) + { + left = top = right = bottom = margins[0]; + } + else if (margins.Count == 2) + { + top = bottom = margins[0]; + left = right = margins[1]; + } + else if (margins.Count == 4) + { + left = margins[0]; + top = margins[1]; + right = margins[2]; + bottom = margins[3]; + } + } + + // Set clip with margins: page.rect + (left, top, -right, -bottom) + parms.Clip = new Rect(page.Rect); + parms.Clip.X0 += left; + parms.Clip.Y0 += top; + parms.Clip.X1 -= right; + parms.Clip.Y1 -= bottom; + + // Extract external links on page + parms.Links = page.GetLinks() + .Where(l => l.Kind == LinkType.LINK_URI && !string.IsNullOrEmpty(l.Uri)) + .ToList(); + + // Extract annotation rectangles on page + try + { + var annots = page.GetAnnots(); + parms.AnnotRects = annots + .Where(a => a.Rect != null) + .Select(a => a.Rect) + .ToList(); + } + catch + { + parms.AnnotRects = new List(); + } + + // Make a TextPage for all later extractions (textFlags passed from ToMarkdown) + parms.TextPage = page.GetTextPage(flags: textFlags, clip: parms.Clip); + + // Extract and process tables if not ignoring graphics + List
<Table> tables = new List<Table>
(); + Dictionary tabRects = new Dictionary(); + List writtenTables = new List(); + + if (!ignoreGraphics && !string.IsNullOrEmpty(tableStrategy)) + { + try + { + var foundTables = page.GetTables(clip: page.Rect, strategy: tableStrategy); + for (int i = 0; i < foundTables.Count; i++) + { + var t = foundTables[i]; + // Remove tables with too few rows or columns + if (t.row_count < 2 || t.col_count < 2) + continue; + tables.Add(t); + // Combine table bbox with header bbox + Rect tabRect = t.bbox; + if (t.header != null && t.header.bbox != null) + { + Rect headerRect = t.header.bbox; + tabRect = Utils.JoinRects(new List { tabRect, headerRect }); + } + tabRects[tables.Count - 1] = tabRect; + } + // Sort tables by position (top to bottom, left to right) + var sortedIndices = Enumerable.Range(0, tables.Count) + .OrderBy(i => tabRects[i].Y0) + .ThenBy(i => tabRects[i].X0) + .ToList(); + var sortedTables = sortedIndices.Select(i => tables[i]).ToList(); + var sortedRects = sortedIndices.ToDictionary( + idx => sortedIndices.IndexOf(idx), + idx => tabRects[idx] + ); + tables = sortedTables; + tabRects = sortedRects; + } + catch + { + // If table extraction fails, continue without tables + } + } + + + // Extract and process images if not ignored + List imgRects = new List(); + if (!ignoreImages) + { + try + { + List imgInfo = page.GetImageInfo(); + + // Filter and process images + var validImages = imgInfo + .Where(img => img.Bbox != null) + .Select(img => new { Bbox = new Rect(img.Bbox), Block = img }) + .Where(img => + img.Bbox.Width >= imageSizeLimit * page.Rect.Width && + img.Bbox.Height >= imageSizeLimit * page.Rect.Height && + img.Bbox.Intersects(page.Rect) && + img.Bbox.Width > 3 && + img.Bbox.Height > 3) + .OrderByDescending(img => Math.Abs(img.Bbox.Width * img.Bbox.Height)) + .Take(30) // Limit to 30 largest images + .ToList(); + + // Remove images contained in larger images + for (int i = validImages.Count - 1; i > 0; i--) + { + Rect r = validImages[i].Bbox; + if (r.IsEmpty) + { + validImages.RemoveAt(i); + continue; + } + for (int j = 0; j < i; j++) + { + if (Utils.BboxInBbox(r, validImages[j].Bbox)) + { + validImages.RemoveAt(i); + break; + } + } + } + + parms.ImgRects = validImages.Select(img => img.Bbox).ToList(); + parms.Images = validImages.Select(img => (object)img.Block).ToList(); + } + catch + { + // If image extraction fails, continue without images + } + } + else + { + parms.ImgRects = new List(); + } + + // Store tables in parms + parms.Tabs = tables; + parms.TabRects = tabRects; + parms.WrittenTables = writtenTables; + parms.TabRects0 = tabRects.Values.ToList(); + + // Check graphics limit and set too_many_graphics flag + bool tooManyGraphics = false; + int graphicsCount = 0; + if (!ignoreGraphics && graphicsLimit.HasValue) + { + try + { + var bboxLog = page.GetBboxlog(); + graphicsCount = bboxLog.Count(b => b.Type != null && b.Type.Contains("path")); + if (graphicsCount > graphicsLimit.Value) + { + ignoreGraphics = true; + tooManyGraphics = true; + } + } + catch + { + // If bboxlog extraction fails, continue + } + } + + // Get paths for graphics and multi-column detection + List paths = new List(); + List vgClusters0 = new List(); + + if (!ignoreGraphics) + { + try + { + paths = page.GetDrawings() + .Where(p => p.Rect != null && + Utils.BboxInBbox(p.Rect, parms.Clip) && + p.Rect.Width < parms.Clip.Width && + p.Rect.Height < parms.Clip.Height && + (p.Rect.Width > 3 || p.Rect.Height > 3) && + !(p.Type == "f" && p.Fill != null && parms.BgColor != null && + p.Fill.Length >= 3 && 
parms.BgColor.Length >= 3 && + Math.Abs(p.Fill[0] - parms.BgColor[0]) < 0.01f && + Math.Abs(p.Fill[1] - parms.BgColor[1]) < 0.01f && + Math.Abs(p.Fill[2] - parms.BgColor[2]) < 0.01f) && + Utils.OutsideAllBboxes(p.Rect, parms.TabRects0) && + Utils.OutsideAllBboxes(p.Rect, parms.AnnotRects)) + .ToList(); + + // Cluster drawings + if (paths.Count > 0) + { + var clusters = page.ClusterDrawings(clip: parms.Clip, drawings: paths); + foreach (var bbox in clusters) + { + if (Utils.IsSignificant(bbox, paths)) + { + vgClusters0.Add(bbox); + } + } + + // Get paths that are in significant graphics + parms.ActualPaths = paths + .Where(p => Utils.BboxInAnyBbox(p.Rect, vgClusters0)) + .ToList(); + } + } + catch + { + paths = new List(); + } + } + + // Also add image rectangles to the list and vice versa + vgClusters0.AddRange(parms.ImgRects); + parms.ImgRects.AddRange(vgClusters0); + parms.ImgRects = parms.ImgRects + .Distinct() + .OrderBy(r => r.Y1) + .ThenBy(r => r.X0) + .ToList(); + parms.WrittenImages = new List(); + + // Refine graphics clusters + parms.VgClusters0 = Utils.RefineBoxes(vgClusters0); + parms.VgClusters = parms.VgClusters0 + .Select((r, i) => new { Index = i, Rect = r }) + .ToDictionary(x => x.Index, x => x.Rect); + + // Calculate character density for text rectangle determination + int blockCount = parms.TextPage.ExtractBlocks().Count; + float charDensity = blockCount > 0 + ? parms.TextPage.ExtractText().Length / (float)blockCount + : 0; + + // Use multi-column detection to get text rectangles + List textRects; + if (tooManyGraphics && charDensity < 20) + { + // This page has too many isolated text pieces for meaningful layout analysis + textRects = new List { parms.Clip }; + } + else + { + try + { + textRects = MultiColumn.ColumnBoxes( + page, + footerMargin: bottom, + headerMargin: top, + noImageText: !forceText, + textpage: parms.TextPage, + paths: parms.ActualPaths, + avoid: parms.TabRects0.Concat(parms.VgClusters0).ToList(), + ignoreImages: ignoreImages); + + // If no columns detected, use the full clip + if (textRects == null || textRects.Count == 0) + { + textRects = new List { parms.Clip }; + } + } + catch + { + // Fallback to full page if column detection fails + textRects = new List { parms.Clip }; + } + } + + // Process each text rectangle + StringBuilder mdOutput = new StringBuilder(); + foreach (Rect textRect in textRects) + { + // Output tables above this text rectangle + mdOutput.Append(OutputTables(parms, textRect, extractWords)); + + // Output images above this text rectangle + mdOutput.Append(OutputImages(parms, textRect, forceText, writeImages, embedImages, + imagePath, imageFormat, filename, dpi, imageSizeLimit, getHeaderId)); + + // Output text inside this rectangle + mdOutput.Append(WriteText(parms, textRect, getHeaderId, forceText, ignoreCode, extractWords)); + } + + // Write any remaining tables and images + mdOutput.Append(OutputTables(parms, null, extractWords)); + mdOutput.Append(OutputImages(parms, null, forceText, writeImages, embedImages, + imagePath, imageFormat, filename, dpi, imageSizeLimit, getHeaderId)); + + // Clean up the output + parms.MdString = mdOutput.ToString(); + parms.MdString = parms.MdString.Replace(" ,", ",").Replace("-\n", ""); + + while (parms.MdString.StartsWith("\n")) + { + parms.MdString = parms.MdString.Substring(1); + } + + parms.MdString = parms.MdString.Replace('\0', Utils.REPLACEMENT_CHARACTER); + + // Handle extract_words mode + if (extractWords) + { + var rawWords = parms.TextPage.ExtractWords(); + rawWords = 
rawWords.OrderBy(w => w.Y1).ThenBy(w => w.X0).ToList(); + + List words = new List(); + foreach (var lrect in parms.LineRects) + { + var lwords = rawWords + .Where(w => + { + var wrect = new Rect(w.X0, w.Y0, w.X1, w.Y1); + return Utils.BboxInBbox(wrect, lrect); + }) + .ToList(); + words.AddRange(SortWords(lwords)); + } + + // Remove duplicates + List nwords = new List(); + foreach (var w in words) + { + if (!nwords.Any(nw => nw.X0 == w.X0 && nw.Y0 == w.Y0 && nw.X1 == w.X1 && nw.Y1 == w.Y1 && nw.Text == w.Text)) + { + nwords.Add(w); + } + } + parms.Words = nwords.Cast().ToList(); + } + else + { + parms.Words = new List(); + } + + // Add page separators + if (pageSeparators) + { + parms.MdString += $"\n\n--- end of page={page.Number} ---\n\n"; + } + + return parms; + } + } +} diff --git a/MuPDF.NET4LLM/helpers/MultiColumn.cs b/MuPDF.NET4LLM/helpers/MultiColumn.cs new file mode 100644 index 0000000..ac7cfbb --- /dev/null +++ b/MuPDF.NET4LLM/helpers/MultiColumn.cs @@ -0,0 +1,421 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using MuPDF.NET; + +namespace MuPDF.NET4LLM.Helpers +{ + /// + /// Multi-column page detection utilities. + /// Ported and adapted from the Python module helpers/multi_column.py in pymupdf4llm. + /// + public static class MultiColumn + { + /// + /// Determine bboxes which wrap a column on the page + /// + public static List ColumnBoxes( + Page page, + float footerMargin = 50, + float headerMargin = 50, + bool noImageText = true, + TextPage textpage = null, + List paths = null, + List avoid = null, + bool ignoreImages = false) + { + // Compute relevant page area + Rect clip = new Rect(page.Rect); + clip.Y1 -= footerMargin; // Remove footer area + clip.Y0 += headerMargin; // Remove header area + + if (paths == null) + { + paths = page.GetDrawings() + .Where(p => p.Rect.Width < clip.Width && p.Rect.Height < clip.Height) + .ToList(); + } + + if (textpage == null) + { + textpage = page.GetTextPage(clip: clip, flags: (int)TextFlags.TEXT_ACCURATE_BBOXES); + } + + List bboxes = new List(); + List imgBboxes = new List(); + if (avoid != null) + imgBboxes.AddRange(avoid); + + List vertBboxes = new List(); + List pathRects = new List(); + + // Path rectangles + foreach (var p in paths) + { + // Give empty path rectangles some small width or height + Rect prect = new Rect(p.Rect); + float lwidth = p.Width > 0 ? 
p.Width * 0.5f : 0.5f; + + if (prect.Width == 0) + { + prect.X0 -= lwidth; + prect.X1 += lwidth; + } + if (prect.Height == 0) + { + prect.Y0 -= lwidth; + prect.Y1 += lwidth; + } + pathRects.Add(prect); + } + + // Sort path bboxes by ascending top, then left coordinates + pathRects = pathRects.OrderBy(b => (b.Y0, b.X0)).ToList(); + + // Bboxes of images on page, no need to sort them + if (!ignoreImages) + { + var images = page.GetImages(); + foreach (var item in images) + { + var boxes = page.GetImageRects(item.Xref); + var rects = boxes.Select(b => b.Rect).ToList(); + imgBboxes.AddRange(rects); + } + } + + // Blocks of text on page + PageInfo pageInfo = textpage.ExtractDict(null, false); + List blocks = pageInfo.Blocks; + + // Make block rectangles, ignoring non-horizontal text + foreach (var b in blocks) + { + Rect bbox = new Rect(b.Bbox); // Bbox of the block + + // Ignore text written upon images + if (noImageText && Utils.BboxInBbox(bbox, Utils.JoinRects(imgBboxes))) + continue; + + // Confirm first line to be horizontal + if (b.Lines == null || b.Lines.Count == 0) + continue; + + Line line0 = b.Lines[0]; // Get first line + if (line0.Dir == null || Math.Abs(1 - line0.Dir.X) > 1e-3) // Only (almost) horizontal text + { + vertBboxes.Add(bbox); // A block with non-horizontal text + continue; + } + + Rect srect = new Rect(); + foreach (var line in b.Lines) + { + Rect lbbox = new Rect(line.Bbox); + string text = string.Join("", line.Spans?.Select(s => s.Text) ?? new string[0]); + if (!Utils.IsWhite(text)) + { + srect = Utils.JoinRects(new List { srect, lbbox }); + } + } + bbox = srect; + + if (!Utils.BboxIsEmpty(bbox)) + bboxes.Add(bbox); + } + + // Sort text bboxes by ascending background, top, then left coordinates + bboxes = bboxes.OrderBy(k => (InBbox(k, pathRects), k.Y0, k.X0)).ToList(); + + // Immediately return if no text found + if (bboxes.Count == 0) + return new List(); + + // -------------------------------------------------------------------- + // Join bboxes to establish some column structure + // -------------------------------------------------------------------- + // The final block bboxes on page + List nblocks = new List { bboxes[0] }; // Pre-fill with first bbox + bboxes = bboxes.Skip(1).ToList(); // Remaining old bboxes + Dictionary cache = new Dictionary(); + + for (int i = 0; i < bboxes.Count; i++) // Iterate old bboxes + { + Rect bb = bboxes[i]; + bool check = false; // Indicates unwanted joins + + // Check if bb can extend one of the new blocks + for (int j = 0; j < nblocks.Count; j++) + { + Rect nbb = nblocks[j]; // A new block + + // Never join across columns + if (nbb.X1 < bb.X0 || bb.X1 < nbb.X0) + continue; + + // Never join across different background colors + if (InBboxUsingCache(nbb, pathRects, cache) != InBboxUsingCache(bb, pathRects, cache)) + continue; + + Rect temp = Utils.JoinRects(new List { bb, nbb }); // Temporary extension of new block + check = CanExtend(temp, nbb, nblocks, vertBboxes); + if (check is true) + { + break; + } + } + + if (!check) // Bb cannot be used to extend any of the new bboxes + { + nblocks.Add(bb); // So add it to the list + int j = nblocks.Count - 1; // Index of it + Rect temp = nblocks[j]; // New bbox added + + // Check if some remaining bbox is contained in temp + check = CanExtend(temp, bb, bboxes, vertBboxes); + if (check is false) + { + nblocks.Add(bb); + } + else + { + nblocks[j] = temp; + } + bboxes[i] = null; + } + } + + // Do some elementary cleaning + nblocks = CleanNblocks(nblocks); + if (nblocks.Count == 0) + return 
nblocks; + + // Several phases of rectangle joining + // TODO: disabled for now as too aggressive: + // nblocks = JoinRectsPhase1(nblocks); + nblocks = JoinRectsPhase2(nblocks); + nblocks = JoinRectsPhase3(nblocks, pathRects, cache); + + // Return identified text bboxes + + if (textpage != null && textpage != page.GetTextPage()) + textpage.Dispose(); + + return nblocks; + } + + private static int InBbox(Rect bb, List bboxes) + { + for (int i = 0; i < bboxes.Count; i++) + { + if (Utils.BboxInBbox(bb, bboxes[i])) + return i + 1; + } + return 0; + } + + private static int InBboxUsingCache(Rect bb, List bboxes, Dictionary cache) + { + string cacheKey = $"{bb.GetHashCode()}_{bboxes.GetHashCode()}"; + if (cache.TryGetValue(cacheKey, out int cached)) + return cached; + + int index = InBbox(bb, bboxes); + cache[cacheKey] = index; + return index; + } + + private static bool IntersectsBboxes(Rect bb, List bboxes) + { + return bboxes.Any(bbox => !Utils.OutsideBbox(bb, bbox, strict: true)); + } + + private static bool CanExtend(Rect temp, Rect bb, List bboxlist, List vertBboxes) + { + foreach (var b in bboxlist) + { + if (!IntersectsBboxes(temp, vertBboxes) && + (b == null || b == bb || Utils.BboxIsEmpty(Utils.IntersectRects(temp, b)))) + continue; + return false; + } + return true; + } + + private static List CleanNblocks(List nblocks) + { + // 1. Remove any duplicate blocks. + if (nblocks.Count < 2) + return nblocks; + + for (int i = nblocks.Count - 1; i > 0; i--) + { + if (nblocks[i].EqualTo(nblocks[i - 1])) + nblocks.RemoveAt(i); + } + + if (nblocks.Count == 0) + return nblocks; + + // 2. Repair sequence in special cases: + // Consecutive bboxes with almost same bottom value are sorted ascending + // by x-coordinate. + float y1 = nblocks[0].Y1; // First bottom coordinate + int i0 = 0; // Its index + int i1 = 0; // Index of last bbox with same bottom + + // Iterate over bboxes, identifying segments with approx. same bottom value. + // Replace every segment by its sorted version. + + for (int i = 1; i < nblocks.Count; i++) + { + Rect b1 = nblocks[i]; + if (Math.Abs(b1.Y1 - y1) > 3) // Different bottom + { + if (i1 > i0) // Segment length > 1? Sort it! + { + var segment = nblocks.Skip(i0).Take(i1 - i0 + 1).OrderBy(b => b.X0).ToList(); + for (int j = 0; j < segment.Count; j++) + nblocks[i0 + j] = segment[j]; + } + y1 = b1.Y1; // Store new bottom value + i0 = i; // Store its start index + } + i1 = i; // Store current index + } + if (i1 > i0) // Segment waiting to be sorted + { + var segment = nblocks.Skip(i0).Take(i1 - i0 + 1).OrderBy(b => b.X0).ToList(); + for (int j = 0; j < segment.Count; j++) + nblocks[i0 + j] = segment[j]; + } + + return nblocks; + } + + private static List JoinRectsPhase2(List bboxes) + { + // Postprocess identified text blocks, phase 2. + // Increase the width of each text block so that small left or right + // border differences are removed. Then try to join even more text + // rectangles. 
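+ // Tolerances used below: left/right borders within 3 points are treated as equal,
+ // and blocks no more than 10 points apart vertically are merged into one rectangle.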
+ List prects = bboxes.Select(b => new Rect(b)).ToList(); // Copy of argument list + + for (int i = 0; i < prects.Count; i++) + { + Rect b = prects[i]; + // Go left and right somewhat + float x0 = prects.Where(bb => Math.Abs(bb.X0 - b.X0) <= 3).Min(bb => bb.X0); + float x1 = prects.Where(bb => Math.Abs(bb.X1 - b.X1) <= 3).Max(bb => bb.X1); + b.X0 = x0; // Store new left / right border + b.X1 = x1; + prects[i] = b; + } + + // Sort by left, top + prects = prects.OrderBy(b => (b.X0, b.Y0)).ToList(); + List newRects = new List { prects[0] }; // Initialize with first item + + // Walk through the rest, top to bottom, then left to right + for (int i = 1; i < prects.Count; i++) + { + Rect r = prects[i]; + Rect r0 = newRects[newRects.Count - 1]; // Previous bbox + + // Join if we have similar borders and are not too far down + if (Math.Abs(r.X0 - r0.X0) <= 3 && + Math.Abs(r.X1 - r0.X1) <= 3 && + Math.Abs(r0.Y1 - r.Y0) <= 10) + { + r0 = Utils.JoinRects(new List { r0, r }); + newRects[newRects.Count - 1] = r0; + continue; + } + // Else append this as new text block + newRects.Add(r); + } + return newRects; + } + + private static List JoinRectsPhase3(List bboxes, List pathRects, Dictionary cache) + { + List prects = bboxes.Select(b => new Rect(b)).ToList(); + List newRects = new List(); + + while (prects.Count > 0) + { + Rect prect0 = prects[0]; + bool repeat = true; + while (repeat) + { + repeat = false; + for (int i = prects.Count - 1; i > 0; i--) + { + Rect prect1 = prects[i]; + // Do not join across columns + if (prect1.X0 > prect0.X1 || prect1.X1 < prect0.X0) + continue; + + // Do not join different backgrounds + if (InBboxUsingCache(prect0, pathRects, cache) != InBboxUsingCache(prect1, pathRects, cache)) + continue; + + Rect temp = Utils.JoinRects(new List { prect0, prect1 }); + var intersecting = prects.Concat(newRects).Where(b => b.Intersects(temp)).ToList(); + if (intersecting.Count == 2 && intersecting.Contains(prect0) && intersecting.Contains(prect1)) + { + prect0 = temp; + prects[0] = prect0; + prects.RemoveAt(i); + repeat = true; + } + } + } + newRects.Add(prect0); + prects.RemoveAt(0); + } + + // Hopefully the most reasonable sorting sequence: + // At this point we have finished identifying blocks that wrap text. + // We now need to determine the SEQUENCE by which text extraction from + // these blocks should take place. This is hardly possible with 100% + // certainty. Our sorting approach is guided by the following thought: + // 1. Extraction should start with the block whose top-left corner is the + // left-most and top-most. + // 2. Any blocks further to the right should be extracted later - even if + // their top-left corner is higher up on the page. + // 3. Sorting the identified rectangles must therefore happen using a + // tuple (y, x) as key, where y is not smaller (= higher up) than that + // of the left-most block with a non-empty vertical overlap. + // 4. To continue "left block" with "next is ...", its sort key must be + // tuple (P.y, Q.x). + var sortRects = newRects.Select(box => + { + // Search for the left-most rect that overlaps like "P" above + // Candidates must have the same background + int background = InBbox(box, pathRects); // This background + var leftRects = newRects + .Where(r => r.X1 < box.X0 && + (box.Y0 <= r.Y0 && r.Y0 <= box.Y1 || box.Y0 <= r.Y1 && r.Y1 <= box.Y1)) + .OrderBy(r => r.X1) + .ToList(); + + (float y, float x) key; + if (leftRects.Count > 0) // If a "P" rectangle was found ... 
+                {
+                    key = (leftRects[leftRects.Count - 1].Y0, box.X0);  // Use this key
+                }
+                else
+                {
+                    key = (box.Y0, box.X0);  // Else use the original (Q.y, Q.x).
+                }
+                return (box, key);
+            })
+            .OrderBy(sr => sr.key)    // By computed key
+            .Select(sr => sr.box)     // Extract sorted rectangles
+            .ToList();
+
+            return sortRects;
+        }
+    }
+}
diff --git a/MuPDF.NET4LLM/helpers/Progress.cs b/MuPDF.NET4LLM/helpers/Progress.cs
new file mode 100644
index 0000000..18caf67
--- /dev/null
+++ b/MuPDF.NET4LLM/helpers/Progress.cs
@@ -0,0 +1,91 @@
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace MuPDF.NET4LLM.Helpers
+{
+    /// <summary>
+    /// Text-based progress bar to allow watching the advancement
+    /// of Markdown conversion of document pages.
+    /// Ported and adapted from the Python helpers/progress.py in pymupdf4llm.
+    ///
+    /// Copyright and License
+    /// Copyright 2024 Artifex Software, Inc.
+    /// License GNU Affero GPL 3.0
+    /// </summary>
+    public class _ProgressBar : IEnumerator<object>
+    {
+        private readonly List<object> _items;
+        private readonly int _progressWidth;
+        private readonly int _lenDigits;
+        private float _progressBarValue;
+        private int _currentIndex;
+        private IEnumerator<object> _enumerator;
+
+        public _ProgressBar(List<object> items, int progressWidth = 40)
+        {
+            _items = items;
+            _progressWidth = progressWidth;
+            _lenDigits = items.Count.ToString().Length;
+            _progressBarValue = 0;
+            _currentIndex = -1;  // Start at -1 for initial MoveNext to work
+            _enumerator = items.GetEnumerator();
+
+            // Calculate the increment for each item based on the list length and the progress width
+            // Init progress bar
+            Console.Write($"[{new string(' ', _progressWidth)}] (0/{_items.Count})");
+            Console.Out.Flush();
+            // Move the cursor back to the start of the bar (one backspace per written character)
+            Console.Write(new string('\b', _progressWidth + _lenDigits + 6));
+        }
+
+        public object Current => _enumerator.Current;
+
+        public bool MoveNext()
+        {
+            if (!_enumerator.MoveNext())
+            {
+                // End progress on StopIteration
+                Console.WriteLine("]\n");
+                return false;
+            }
+
+            // Update the current index
+            _currentIndex++;
+
+            // Add the increment to the progress bar and calculate how many "=" to add
+            _progressBarValue += (float)_progressWidth / _items.Count;
+
+            int filledLength = (int)(_currentIndex * (float)_progressWidth / _items.Count);
+            // Update the numerical progress
+            string paddedIndex = (_currentIndex + 1).ToString().PadLeft(_lenDigits);
+            string progressInfo = $" ({paddedIndex}/{_items.Count})";
+
+            Console.Write($"\r[{new string('=', filledLength)}{new string(' ', _progressWidth - filledLength)}]");
+            Console.Write(progressInfo);
+            Console.Out.Flush();
+
+            return true;
+        }
+
+        public void Reset()
+        {
+            _currentIndex = -1;
+            _progressBarValue = 0;
+            _enumerator.Reset();
+        }
+
+        public void Dispose()
+        {
+            _enumerator?.Dispose();
+        }
+    }
+
+    public static class ProgressBar
+    {
+        public static IEnumerator<object> Create(List<object> list, int progressWidth = 40)
+        {
+            return new _ProgressBar(list, progressWidth);
+        }
+    }
+}
diff --git a/MuPDF.NET4LLM/helpers/Utils.cs b/MuPDF.NET4LLM/helpers/Utils.cs
new file mode 100644
index 0000000..385c55e
--- /dev/null
+++ b/MuPDF.NET4LLM/helpers/Utils.cs
@@ -0,0 +1,669 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using MuPDF.NET;
+using mupdf;
+
+namespace MuPDF.NET4LLM.Helpers
+{
+    ///
+    /// Utility functions for PDF processing and layout analysis.
+    /// Ported and adapted from the Python module helpers/utils.py in pymupdf4llm.
+ /// + public static class Utils + { + // Constants + public static readonly HashSet WHITE_CHARS = new HashSet( + Enumerable.Range(0, 33).Select(i => (char)i) + .Concat(new[] + { + '\u00a0', // Non-breaking space + '\u2000', // En quad + '\u2001', // Em quad + '\u2002', // En space + '\u2003', // Em space + '\u2004', // Three-per-em space + '\u2005', // Four-per-em space + '\u2006', // Six-per-em space + '\u2007', // Figure space + '\u2008', // Punctuation space + '\u2009', // Thin space + '\u200a', // Hair space + '\u202f', // Narrow no-break space + '\u205f', // Medium mathematical space + '\u3000', // Ideographic space + }) + ); + + public const char REPLACEMENT_CHARACTER = '\uFFFD'; + public const string TYPE3_FONT_NAME = "Unnamed-T3"; + + public static readonly HashSet BULLETS = new HashSet( + new[] + { + '\u002A', // * + '\u002D', // - + '\u003E', // > + '\u006F', // o + '\u00B6', // ¶ + '\u00B7', // · + '\u2010', // ‐ + '\u2011', // ‑ + '\u2012', // ‒ + '\u2013', // – + '\u2014', // — + '\u2015', // ― + '\u2020', // † + '\u2021', // ‡ + '\u2022', // • + '\u2212', // − + '\u2219', // ∙ + '\uF0A7', // Private use + '\uF0B7', // Private use + REPLACEMENT_CHARACTER, + } + .Concat(Enumerable.Range(0x25A0, 0x2600 - 0x25A0).Select(i => (char)i)) + ); + + public static int FLAGS = (int)( + mupdf.mupdf.FZ_STEXT_COLLECT_STYLES | + mupdf.mupdf.FZ_STEXT_COLLECT_VECTORS | + (int)TextFlags.TEXT_PRESERVE_IMAGES | + (int)TextFlags.TEXT_ACCURATE_BBOXES | + (int)TextFlags.TEXT_MEDIABOX_CLIP + ); + + /// + /// Traverse /AcroForm/Fields hierarchy and return a dict: + /// fully qualified field name -> {"value": ..., "pages": [...]} + /// Optionally, the xref of the field is included. + /// + public static Dictionary> ExtractFormFieldsWithPages(Document doc, bool xrefs = false) + { + // Access the AcroForm dictionary. + // Fast exit if not present or empty. + // Placeholder - would need to access PDF internals + return new Dictionary>(); + } + + /// + /// Normalize a folder path ("" = script folder), ensure it exists, + /// and return a Markdown-safe file reference using forward slashes. + /// Prefers relative paths to avoid Windows drive-letter issues. + /// + public static (string mdRef, string actualPath) MdPath(string folder, string filename) + { + // 1. Use current working directory as script dir. + string scriptDir = Directory.GetCurrentDirectory(); + string basePath; + + if (string.IsNullOrWhiteSpace(folder)) + { + basePath = scriptDir; + } + else + { + basePath = Environment.ExpandEnvironmentVariables(folder); + basePath = Path.GetFullPath(basePath); + } + + // 2. Create folder if it doesn't exist + Directory.CreateDirectory(basePath); + + // 3. Build full file path + string fullPath = Path.Combine(basePath, Path.GetFileName(filename)); + string mdRef; + + // 4. Try to compute a relative path (best for Markdown) + // Calculate relative path manually for compatibility with .NET Standard 2.0 + // Path.GetRelativePath is only available in .NET Core 2.1+ and .NET Standard 2.1+ + if (fullPath.StartsWith(scriptDir, StringComparison.OrdinalIgnoreCase)) + { + string relative = fullPath.Substring(scriptDir.Length).TrimStart(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar); + mdRef = relative.Replace("\\", "/"); + if (!string.IsNullOrEmpty(mdRef) && !mdRef.StartsWith(".")) + mdRef = "./" + mdRef; + } + else + { + // Not relative → fall back to POSIX path + mdRef = fullPath.Replace("\\", "/"); + } + // 5. 
Escape Markdown-sensitive characters + // Escaping bracket is for MD references only, not for actual file saving. + // The first item is the MD-safe form, + // the second is the actual path to use in pixmap saving. + mdRef = mdRef.Replace("(", "-").Replace(")", "-") + .Replace("[", "-").Replace("]", "-"); + + return (mdRef, fullPath); + } + + /// + /// Check if text starts with a bullet character + /// + public static bool StartswithBullet(string text) + { + if (string.IsNullOrEmpty(text)) + return false; + if (!BULLETS.Contains(text[0])) + return false; + if (text.Length == 1) + return true; + if (text[1] == ' ') + return true; + return false; + } + + /// + /// Identify white text + /// + public static bool IsWhite(string text) + { + if (string.IsNullOrEmpty(text)) + return true; + return text.All(c => WHITE_CHARS.Contains(c)); + } + + /// + /// Check if bounding box is empty + /// + public static bool BboxIsEmpty(Rect bbox) + { + if (bbox == null) + return true; + return bbox.X0 >= bbox.X1 || bbox.Y0 >= bbox.Y1; + } + + /// + /// Intersect two rectangles + /// + public static Rect IntersectRects(Rect r1, Rect r2, bool bboxOnly = false) + { + if (r1 == null || r2 == null) + return new Rect(); + + float x0 = Math.Max(r1.X0, r2.X0); + float y0 = Math.Max(r1.Y0, r2.Y0); + float x1 = Math.Min(r1.X1, r2.X1); + float y1 = Math.Min(r1.Y1, r2.Y1); + + if (x0 >= x1 || y0 >= y1) + return new Rect(); + + return new Rect(x0, y0, x1, y1); + } + + /// + /// Join a list of rectangles into their bounding rectangle + /// + public static Rect JoinRects(List rects, bool bboxOnly = false) + { + if (rects == null || rects.Count == 0) + return new Rect(); + + float x0 = rects.Min(r => r.X0); + float y0 = rects.Min(r => r.Y0); + float x1 = rects.Max(r => r.X1); + float y1 = rects.Max(r => r.Y1); + + return new Rect(x0, y0, x1, y1); + } + + /// + /// Check if bbox is almost entirely within clip + /// + public static bool AlmostInBbox(Rect bbox, Rect clip, float portion = 0.8f) + { + if (bbox == null || clip == null) + return false; + + float x0 = Math.Max(bbox.X0, clip.X0); + float y0 = Math.Max(bbox.Y0, clip.Y0); + float x1 = Math.Min(bbox.X1, clip.X1); + float y1 = Math.Min(bbox.Y1, clip.Y1); + + float interArea = Math.Max(0, x1 - x0) * Math.Max(0, y1 - y0); + float boxArea = (bbox.X1 - bbox.X0) * (bbox.Y1 - bbox.Y0); + + // If intersection area is greater than portion of box area + return interArea > boxArea * portion; + } + + /// + /// Check if bbox is outside cell + /// + public static bool OutsideBbox(Rect bbox, Rect cell, bool strict = false) + { + if (bbox == null || cell == null) + return true; + + if (!strict) + { + return bbox.X0 >= cell.X1 || bbox.X1 <= cell.X0 || + bbox.Y0 >= cell.Y1 || bbox.Y1 <= cell.Y0; + } + else + { + return bbox.X0 > cell.X1 || bbox.X1 < cell.X0 || + bbox.Y0 > cell.Y1 || bbox.Y1 < cell.Y0; + } + } + + /// + /// Check if inner rectangle is contained within outer rectangle + /// + public static bool BboxInBbox(Rect inner, Rect outer) + { + if (inner == null || outer == null) + return false; + + return outer.X0 <= inner.X0 && outer.Y0 <= inner.Y0 && + outer.X1 >= inner.X1 && outer.Y1 >= inner.Y1; + } + + /// + /// Check if rect is contained in any rect of the list + /// + public static bool BboxInAnyBbox(Rect rect, IEnumerable rectList) + { + if (rect == null || rectList == null) + return false; + + return rectList.Any(r => BboxInBbox(rect, r)); + } + + /// + /// Check if rect is outside all rects in the list + /// + public static bool OutsideAllBboxes(Rect rect, IEnumerable 
rectList) + { + if (rect == null || rectList == null) + return true; + + return rectList.All(r => OutsideBbox(rect, r)); + } + + /// + /// Check if middle of rect is contained in any rect of the list + /// + public static bool AlmostInAnyBbox(Rect rect, IEnumerable rectList, float portion = 0.5f) + { + if (rect == null || rectList == null) + return false; + + // Enlarge rect slightly + Rect enlarged = new Rect( + rect.X0 - 1, + rect.Y0 - 1, + rect.X1 + 1, + rect.Y1 + 1 + ); + + return rectList.Any(r => AlmostInBbox(enlarged, r, portion)); + } + + /// + /// Join any rectangles with a pairwise non-empty overlap. + /// Accepts and returns a list of Rect items. + /// Note that rectangles that only "touch" each other (common point or edge) + /// are not considered as overlapping. + /// Use a positive "enlarge" parameter to enlarge rectangle by these many + /// points in every direction. + /// TODO: Consider using a sweeping line algorithm for this. + /// + public static List RefineBoxes(List boxes, float enlarge = 0) + { + if (boxes == null || boxes.Count == 0) + return new List(); + + List newRects = new List(); + // List of all vector graphic rectangles + List prects = boxes.Select(b => new Rect(b)).ToList(); + + while (prects.Count > 0) // The algorithm will empty this list + { + Rect r = new Rect(prects[0]); // Copy of first rectangle + r.X0 -= enlarge; + r.Y0 -= enlarge; + r.X1 += enlarge; + r.Y1 += enlarge; + + bool repeat = true; // Initialize condition + while (repeat) + { + repeat = false; // Set false as default + for (int i = prects.Count - 1; i > 0; i--) // From back to front + { + if (r.Intersects(prects[i])) // Enlarge first rect with this + { + r = Utils.JoinRects(new List { r, prects[i] }); + prects.RemoveAt(i); // Delete this rect + repeat = true; // Indicate must try again + } + } + } + + // First rect now includes all overlaps + newRects.Add(r); + prects.RemoveAt(0); + } + + return newRects + .OrderBy(r => r.X0) + .ThenBy(r => r.Y0) + .ToList(); // Sort by left, top + } + + /// + /// Determine the background color of the page + /// + public static float[] GetBgColor(Page page) + { + if (page == null) + return null; + + try + { + // Check upper left corner + Rect ulRect = new Rect(page.Rect.X0, page.Rect.Y0, page.Rect.X0 + 10, page.Rect.Y0 + 10); + Pixmap pixUL = page.GetPixmap(clip: ulRect); + if (pixUL == null || pixUL.SAMPLES == null || !pixUL.IsUniColor) + { + pixUL?.Dispose(); + return null; + } + var pixelUL = pixUL.GetPixel(0, 0); + pixUL.Dispose(); + + // Check upper right corner + Rect urRect = new Rect(page.Rect.X1 - 10, page.Rect.Y0, page.Rect.X1, page.Rect.Y0 + 10); + Pixmap pixUR = page.GetPixmap(clip: urRect); + if (pixUR == null || pixUR.SAMPLES == null || !pixUR.IsUniColor) + { + pixUR?.Dispose(); + return null; + } + var pixelUR = pixUR.GetPixel(0, 0); + pixUR.Dispose(); + + if (pixelUL.Length != pixelUR.Length || + !pixelUL.SequenceEqual(pixelUR)) + return null; + + // Check lower left corner + Rect llRect = new Rect(page.Rect.X0, page.Rect.Y1 - 10, page.Rect.X0 + 10, page.Rect.Y1); + Pixmap pixLL = page.GetPixmap(clip: llRect); + if (pixLL == null || pixLL.SAMPLES == null || !pixLL.IsUniColor) + { + pixLL?.Dispose(); + return null; + } + var pixelLL = pixLL.GetPixel(0, 0); + pixLL.Dispose(); + + if (pixelUL.Length != pixelLL.Length || + !pixelUL.SequenceEqual(pixelLL)) + return null; + + // Check lower right corner + Rect lrRect = new Rect(page.Rect.X1 - 10, page.Rect.Y1 - 10, page.Rect.X1, page.Rect.Y1); + Pixmap pixLR = page.GetPixmap(clip: lrRect); + if 
(pixLR == null || pixLR.SAMPLES == null || !pixLR.IsUniColor) + { + pixLR?.Dispose(); + return null; + } + var pixelLR = pixLR.GetPixel(0, 0); + pixLR.Dispose(); + + if (pixelUL.Length != pixelLR.Length || + !pixelUL.SequenceEqual(pixelLR)) + return null; + + // All corners match - return normalized RGB + if (pixelUL.Length >= 3) + { + return new float[] + { + pixelUL[0] / 255f, + pixelUL[1] / 255f, + pixelUL[2] / 255f + }; + } + } + catch + { + // If background detection fails, return null + } + + return null; + } + + /// + /// Check whether the rectangle contains significant drawings + /// + public static bool IsSignificant(Rect box, List paths) + { + if (box == null || paths == null || paths.Count == 0) + return false; + + // Build a sub-box of 90% of the original box + // To this end, we build a sub-box of 90% of the original box and check + // whether this still contains drawing paths. + float d; + if (box.Width > box.Height) + d = box.Width * 0.025f; + else + d = box.Height * 0.025f; + + Rect nbox = new Rect( + box.X0 + d, + box.Y0 + d, + box.X1 - d, + box.Y1 - d + ); // Nbox covers 90% of box interior + + // Paths contained in, but not equal to box + var myPaths = paths + .Where(p => p.Rect != null && + BboxInBbox(p.Rect, box) && + !p.Rect.EqualTo(box)) + .ToList(); + + if (myPaths.Count == 0) + return false; + + // Check if all paths are horizontal or vertical lines + var widths = myPaths.Select(p => (int)Math.Round(p.Rect.Width)) + .Concat(new[] { (int)Math.Round(box.Width) }) + .Distinct() + .ToList(); + var heights = myPaths.Select(p => (int)Math.Round(p.Rect.Height)) + .Concat(new[] { (int)Math.Round(box.Height) }) + .Distinct() + .ToList(); + + if (widths.Count == 1 || heights.Count == 1) + return false; // All paths are horizontal or vertical lines / rectangles + + // Check if any path intersects the interior + foreach (var p in myPaths) + { + Rect rect = p.Rect; + if (!( + BboxIsEmpty(rect) || BboxIsEmpty(IntersectRects(rect, nbox)) + )) // Intersects interior: significant! 
+ { + return true; + } + // Remaining case: a horizontal or vertical line + // Horizontal line: + if ( + true + && Math.Abs(rect.Y0 - rect.Y1) < 0.1f + && nbox.Y0 <= rect.Y0 && rect.Y0 <= nbox.Y1 + && rect.X0 < nbox.X1 + && rect.X1 > nbox.X0 + ) + { + return true; + } + // Vertical line + if ( + true + && Math.Abs(rect.X0 - rect.X1) < 0.1f + && nbox.X0 <= rect.X0 && rect.X0 <= nbox.X1 + && rect.Y0 < nbox.Y1 + && rect.Y1 > nbox.Y0 + ) + { + return true; + } + } + + return false; + } + + /// + /// Expand bbox to include all points + /// + public static (float x0, float y0, float x1, float y1) ExpandBboxByPoints( + (float x0, float y0, float x1, float y1) bbox, + List points) + { + if (points == null || points.Count == 0) + return bbox; + + float x0 = Math.Min(points.Min(p => p.X), bbox.x0); + float y0 = Math.Min(points.Min(p => p.Y), bbox.y0); + float x1 = Math.Max(points.Max(p => p.X), bbox.x1); + float y1 = Math.Max(points.Max(p => p.Y), bbox.y1); + + return (x0, y0, x1, y1); + } + + /// + /// Analyze the page for OCR decision + /// + public static Dictionary AnalyzePage(Page page, List blocks = null) + { + int charsTotal = 0; + int charsBad = 0; + + if (blocks == null) + { + TextPage textPage = page.GetTextPage( + clip: new Rect(float.NegativeInfinity, float.NegativeInfinity, + float.PositiveInfinity, float.PositiveInfinity), + flags: FLAGS); + PageInfo pageInfo = textPage.ExtractDict(null, false); + blocks = pageInfo.Blocks; + textPage.Dispose(); + } + + Rect imgRect = new Rect(); + Rect txtRect = new Rect(); + Rect vecRect = new Rect(); + float imgArea = 0; + float txtArea = 0; + float vecArea = 0; + int ocrSpans = 0; + + foreach (var b in blocks) + { + // Intersect each block bbox with the page rectangle. + // Note that this has no effect on text because of the clipping flags, + // which causes that we will not see ANY clipped text. + Rect bbox = IntersectRects(page.Rect, b.Bbox); + float area = bbox.Width * bbox.Height; + if (area == 0.0f) // Skip any empty block + continue; + + if (b.Type == 1) // Image block + { + imgRect = JoinRects(new List { imgRect, bbox }); + imgArea += area; + } + else if (b.Type == 0) // Text block + { + if (BboxIsEmpty(b.Bbox)) + continue; + + if (b.Lines != null) + { + foreach (var line in b.Lines) + { + if (BboxIsEmpty(line.Bbox)) + continue; + + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + string text = span.Text ?? 
""; + if (IsWhite(text)) + continue; + + Rect sr = IntersectRects(page.Rect, span.Bbox); + if (BboxIsEmpty(sr)) + continue; + + // Check for OCR spans: font is "GlyphLessFont" or + // (char_flags & 8 == 0 and char_flags & 16 == 0) + // Note: CharFlags and Alpha may need to be accessed differently + // For now, check font name for OCR detection + if (span.Font == "GlyphLessFont") + { + ocrSpans++; + } + // Alpha check would need to be implemented based on available API + // Skip invisible text (alpha == 0) + + charsTotal += text.Trim().Length; + charsBad += text.Count(c => c == REPLACEMENT_CHARACTER); + txtRect = JoinRects(new List { txtRect, sr }); + txtArea += sr.Width * sr.Height; + } + } + } + } + } + else if ( + true + && b.Type == 3 // Vector block + // && b.Stroked // Note: Stroked and IsRect may not be available + && 2 < bbox.Width && bbox.Width <= 20 // Width limit for typical characters + && 2 < bbox.Height && bbox.Height <= 20 // Height limit for typical characters + // && !b.IsRect // Contains curves + ) + { + // Potential character-like vector block + vecRect = JoinRects(new List { vecRect, bbox }); + vecArea += area; + } + } + + // The rectangle on page covered by some content + Rect covered = JoinRects(new List { imgRect, txtRect, vecRect }); + float coverArea = Math.Abs(covered.Width * covered.Height); + + // The area-related float values are computed as fractions of the total covered area. + return new Dictionary + { + ["covered"] = covered, // Page area covered by content + ["img_joins"] = coverArea > 0 ? Math.Abs(imgRect.Width * imgRect.Height) / coverArea : 0, // Fraction of area of the joined images + ["img_area"] = coverArea > 0 ? imgArea / coverArea : 0, // Fraction of sum of image area sizes + ["txt_joins"] = coverArea > 0 ? Math.Abs(txtRect.Width * txtRect.Height) / coverArea : 0, // Fraction of area of the joined text spans + ["txt_area"] = coverArea > 0 ? txtArea / coverArea : 0, // Fraction of sum of text span bbox area sizes + ["vec_area"] = coverArea > 0 ? vecArea / coverArea : 0, // Fraction of sum of vector character area sizes + ["vec_joins"] = coverArea > 0 ? Math.Abs(vecRect.Width * vecRect.Height) / coverArea : 0, // Fraction of area of the joined vector characters + ["chars_total"] = charsTotal, // Count of visible characters + ["chars_bad"] = charsBad, // Count of Replacement Unicode characters + ["ocr_spans"] = ocrSpans, // Count: text spans with ignored text (render mode 3) + }; + } + } +} diff --git a/MuPDF.NET4LLM/llama/PDFMarkdownReader.cs b/MuPDF.NET4LLM/llama/PDFMarkdownReader.cs new file mode 100644 index 0000000..09fa80a --- /dev/null +++ b/MuPDF.NET4LLM/llama/PDFMarkdownReader.cs @@ -0,0 +1,152 @@ +using System; +using System.Collections.Generic; +using System.IO; +using MuPDF.NET; + +namespace MuPDF.NET4LLM.Llama +{ + /// + /// LlamaIndex-compatible PDF reader using MuPDF.NET4LLM. + /// Ported and adapted from the Python module llama/pdf_markdown_reader.py. + /// Note: This is a C# implementation that provides similar functionality + /// to the original Python `PDFMarkdownReader`. + /// + public class PDFMarkdownReader + { + public Func, Dictionary> MetaFilter { get; set; } + + public PDFMarkdownReader(Func, Dictionary> metaFilter = null) + { + MetaFilter = metaFilter; + } + + /// + /// Loads list of documents from PDF file and also accepts extra information in dict format. + /// + /// + /// Path-like object (string or Path-like) pointing to the PDF file. 
+        ///
+        ///
+        /// Optional base metadata dictionary that is copied and enriched per page
+        /// (file path, page number, total pages, document metadata).
+        ///
+        ///
+        /// Optional keyword arguments controlling rendering:
+        /// write_images, embed_images, image_path,
+        /// image_format, force_text, show_progress – these are
+        /// forwarded to Helpers.MuPdfRag.ToMarkdown.
+        ///
+        ///
+        /// A list of LlamaIndexDocument instances, one per page, whose
+        /// Text contains Markdown for that page and whose
+        /// ExtraInfo holds page-level metadata.
+        ///
+        public List<LlamaIndexDocument> LoadData(
+            object filePath,  // Can be Path or string
+            Dictionary<string, object> extraInfo = null,
+            Dictionary<string, object> loadKwargs = null)
+        {
+            if (filePath == null)
+                throw new ArgumentNullException(nameof(filePath));
+
+            string filePathStr = filePath is string str ? str : filePath.ToString();
+            if (!File.Exists(filePathStr))
+                throw new FileNotFoundException($"File not found: {filePathStr}");
+
+            if (extraInfo == null)
+                extraInfo = new Dictionary<string, object>();
+
+            if (loadKwargs == null)
+                loadKwargs = new Dictionary<string, object>();
+
+            // Extract text header information
+            var hdrInfo = new Helpers.IdentifyHeaders(filePathStr);
+
+            Document doc = new Document(filePathStr);
+            List<LlamaIndexDocument> docs = new List<LlamaIndexDocument>();
+
+            try
+            {
+                for (int i = 0; i < doc.PageCount; i++)
+                {
+                    docs.Add(ProcessDocPage(
+                        doc, extraInfo, filePathStr, i, hdrInfo, loadKwargs));
+                }
+            }
+            finally
+            {
+                doc.Close();
+            }
+
+            return docs;
+        }
+
+        private LlamaIndexDocument ProcessDocPage(
+            Document doc,
+            Dictionary<string, object> extraInfo,
+            string filePath,
+            int pageNumber,
+            object hdrInfo,
+            Dictionary<string, object> loadKwargs)
+        {
+            extraInfo = ProcessDocMeta(doc, filePath, pageNumber, extraInfo);
+
+            if (MetaFilter != null)
+                extraInfo = MetaFilter(extraInfo);
+
+            string text = Helpers.MuPdfRag.ToMarkdown(
+                doc,
+                pages: new List<int> { pageNumber },
+                hdrInfo: hdrInfo,
+                writeImages: loadKwargs.ContainsKey("write_images") && (bool)loadKwargs["write_images"],
+                embedImages: loadKwargs.ContainsKey("embed_images") && (bool)loadKwargs["embed_images"],
+                imagePath: loadKwargs.ContainsKey("image_path") ? (string)loadKwargs["image_path"] : "",
+                imageFormat: loadKwargs.ContainsKey("image_format") ? (string)loadKwargs["image_format"] : "png",
+                filename: filePath,
+                forceText: loadKwargs.ContainsKey("force_text") ? (bool)loadKwargs["force_text"] : true,
+                showProgress: loadKwargs.ContainsKey("show_progress") && (bool)loadKwargs["show_progress"]
+            );
+
+            return new LlamaIndexDocument
+            {
+                Text = text,
+                ExtraInfo = extraInfo
+            };
+        }
+
+        /// <summary>
+        /// Process metadata of a PDF document.
+        /// </summary>
+        private Dictionary<string, object> ProcessDocMeta(
+            Document doc,
+            string filePath,
+            int pageNumber,
+            Dictionary<string, object> extraInfo)
+        {
+            if (extraInfo == null)
+                extraInfo = new Dictionary<string, object>();
+
+            // Add document metadata
+            var metadata = doc.MetaData;
+            foreach (var kvp in metadata)
+            {
+                extraInfo[kvp.Key] = kvp.Value;
+            }
+
+            extraInfo["page"] = pageNumber + 1;
+            extraInfo["total_pages"] = doc.PageCount;
+            extraInfo["file_path"] = filePath;
+
+            return extraInfo;
+        }
+    }
+
+    /// <summary>
+    /// Document structure for LlamaIndex compatibility
+    /// </summary>
+    public class LlamaIndexDocument
+    {
+        public string Text { get; set; }
+        public Dictionary<string, object> ExtraInfo { get; set; }
+    }
+}
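
Note: the following is a minimal usage sketch added for illustration. It assumes the API introduced in this diff (PDFMarkdownReader.LoadData, LlamaIndexDocument, and the load_kwargs keys read in ProcessDocPage); the input file name is hypothetical.

    using System;
    using System.Collections.Generic;
    using MuPDF.NET4LLM.Llama;

    class ReaderDemo
    {
        static void Main()
        {
            var reader = new PDFMarkdownReader();
            var loadKwargs = new Dictionary<string, object>
            {
                ["write_images"] = false,   // keep the Markdown self-contained
                ["show_progress"] = true    // print the text progress bar while converting
            };

            // "input.pdf" is a placeholder path; one LlamaIndexDocument is returned per page.
            List<LlamaIndexDocument> docs = reader.LoadData("input.pdf", null, loadKwargs);
            foreach (var d in docs)
                Console.WriteLine($"--- page {d.ExtraInfo["page"]} ---\n{d.Text}");
        }
    }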