diff --git a/Demo/Demo.csproj b/Demo/Demo.csproj index 5efb0a5..4d71245 100644 --- a/Demo/Demo.csproj +++ b/Demo/Demo.csproj @@ -8,6 +8,7 @@ + diff --git a/Demo/Program.cs b/Demo/Program.cs index a46fb18..85fa730 100644 --- a/Demo/Program.cs +++ b/Demo/Program.cs @@ -1,5 +1,7 @@ -using mupdf; +using mupdf; using MuPDF.NET; +using MuPDF.NET4LLM; +using MuPDF.NET4LLM.Helpers; using SkiaSharp; using System; using System.Collections.Generic; @@ -18,6 +20,7 @@ using Font = MuPDF.NET.Font; using Morph = MuPDF.NET.Morph; using TextWriter = MuPDF.NET.TextWriter; +using Utils = MuPDF.NET.Utils; namespace Demo { @@ -39,46 +42,433 @@ class Program { static void Main(string[] args) { - TestInsertHtmlbox(); - TestLineAnnot(); - AnnotationsFreeText1.Run(args); - AnnotationsFreeText2.Run(args); - NewAnnots.Run(args); - TestHelloWorldToNewDocument(args); - TestHelloWorldToExistingDocument(args); - TestReadBarcode(args); - TestReadDataMatrix(); - TestWriteBarcode(args); - TestExtractTextWithLayout(args); - TestWidget(args); - TestColor(args); - TestCMYKRecolor(args); - TestSVGRecolor(args); - TestReplaceImage(args); - TestInsertImage(args); - TestGetImageInfo(args); - TestGetTextPageOcr(args); - TestCreateImagePage(args); - TestJoinPdfPages(args); - TestFreeTextAnnot(args); - TestTextFont(args); - TestMemoryLeak(); - TestDrawLine(); - TestWriteBarcode1(); - TestUnicodeDocument(); - TestMorph(); - TestMetadata(); - TestMoveFile(); - TestImageFilter(); - TestImageFilterOcr(); - CreateAnnotDocument(); - TestDrawShape(); - TestIssue213(); - TestIssue1880(); + //TestInsertHtmlbox(); + //TestLineAnnot(); + //AnnotationsFreeText1.Run(args); + //AnnotationsFreeText2.Run(args); + //NewAnnots.Run(args); + //TestHelloWorldToNewDocument(args); + //TestHelloWorldToExistingDocument(args); + //TestReadBarcode(args); + //TestReadDataMatrix(); + //TestWriteBarcode(args); + //TestExtractTextWithLayout(args); + //TestWidget(args); + //TestColor(args); + //TestCMYKRecolor(args); + //TestSVGRecolor(args); + //TestReplaceImage(args); + //TestInsertImage(args); + //TestGetImageInfo(args); + //TestGetTextPageOcr(args); + //TestCreateImagePage(args); + //TestJoinPdfPages(args); + //TestFreeTextAnnot(args); + //TestTextFont(args); + //TestMemoryLeak(); + //TestDrawLine(); + //TestWriteBarcode1(); + //TestUnicodeDocument(); + //TestMorph(); + //TestMetadata(); + //TestMoveFile(); + //TestImageFilter(); + //TestImageFilterOcr(); + //CreateAnnotDocument(); + //TestDrawShape(); + //TestIssue213(); + //TestIssue1880(); + //TestLLM(); + TestPyMuPdfRagToMarkdown(); // Uncomment to test PyMuPdfRag.ToMarkdown() directly + //TestTable(); return; } + static void TestTable() + { + Console.WriteLine("\n=== TestTable ======================="); + + try + { + string testFilePath = Path.GetFullPath("../../../TestDocuments/national-capitals.pdf"); + + if (!File.Exists(testFilePath)) + { + Console.WriteLine($"Error: Test file not found: {testFilePath}"); + return; + } + + Console.WriteLine($"Loading PDF: {testFilePath}"); + Document doc = new Document(testFilePath); + Console.WriteLine($"Document loaded: {doc.PageCount} page(s)"); + + // Test on first page + Page page = doc[0]; + Console.WriteLine($"\nPage 0 - Rect: {page.Rect}"); + + // Test 1: Get tables with default strategy + Console.WriteLine("\n--- Test 1: Get tables with 'lines_strict' strategy ---"); + List tables = Utils.GetTables( + page, + clip: page.Rect, + vertical_strategy: "lines_strict", + horizontal_strategy: "lines_strict"); + + Console.WriteLine($"Found {tables.Count} table(s) on 
page 0");
+
+ if (tables.Count > 0)
+ {
+ for (int i = 0; i < tables.Count; i++)
+ {
+ Table table = tables[i];
+ Console.WriteLine($"\n Table {i + 1}:");
+ Console.WriteLine($" Rows: {table.row_count}");
+ Console.WriteLine($" Columns: {table.col_count}");
+ if (table.bbox != null)
+ {
+ Console.WriteLine($" BBox: ({table.bbox.X0:F2}, {table.bbox.Y0:F2}, {table.bbox.X1:F2}, {table.bbox.Y1:F2})");
+ }
+
+ // Display header information
+ if (table.header != null)
+ {
+ Console.WriteLine($" Header:");
+ Console.WriteLine($" External: {table.header.external}");
+ if (table.header.names != null && table.header.names.Count > 0)
+ {
+ Console.WriteLine($" Column names: {string.Join(", ", table.header.names)}");
+ }
+ }
+
+ // Extract table data
+ Console.WriteLine($"\n Extracting table data...");
+ List<List<string>> tableData = table.Extract();
+ if (tableData != null && tableData.Count > 0)
+ {
+ Console.WriteLine($" Extracted {tableData.Count} row(s) of data");
+ // Show first few rows as preview
+ int previewRows = Math.Min(3, tableData.Count);
+ for (int row = 0; row < previewRows; row++)
+ {
+ var rowData = tableData[row];
+ if (rowData != null)
+ {
+ Console.WriteLine($" Row {row + 1}: {string.Join(" | ", rowData.Take(5))}"); // Show first 5 columns
+ }
+ }
+ if (tableData.Count > previewRows)
+ {
+ Console.WriteLine($" ... and {tableData.Count - previewRows} more row(s)");
+ }
+ }
+
+ // Convert to markdown
+ Console.WriteLine($"\n Converting to Markdown...");
+ try
+ {
+ string markdown = table.ToMarkdown(clean: false, fillEmpty: true);
+ if (!string.IsNullOrEmpty(markdown))
+ {
+ Console.WriteLine($" Markdown length: {markdown.Length} characters");
+ // Save markdown to file
+ string markdownFile = $"table_{i + 1}_page0.md";
+ File.WriteAllText(markdownFile, markdown, Encoding.UTF8);
+ Console.WriteLine($" Markdown saved to: {markdownFile}");
+
+ // Show preview
+ int previewLength = Math.Min(200, markdown.Length);
+ Console.WriteLine($" Preview (first {previewLength} chars):");
+ Console.WriteLine($" {markdown.Substring(0, previewLength)}...");
+ }
+ }
+ catch (Exception ex)
+ {
+ Console.WriteLine($" Error converting to markdown: {ex.Message}");
+ }
+ }
+ }
+ else
+ {
+ Console.WriteLine("No tables found. Trying with 'lines' strategy...");
+
+ // Test 2: Try with 'lines' strategy (less strict)
+ Console.WriteLine("\n--- Test 2: Get tables with 'lines' strategy ---");
+ tables = Utils.GetTables(
+ page,
+ clip: page.Rect,
+ vertical_strategy: "lines",
+ horizontal_strategy: "lines");
+
+ Console.WriteLine($"Found {tables.Count} table(s) with 'lines' strategy");
+ }
+
+ // Test 3: Try with 'text' strategy
+ Console.WriteLine("\n--- Test 3: Get tables with 'text' strategy ---");
+ List<Table> textTables = Utils.GetTables(
+ page,
+ clip: page.Rect,
+ vertical_strategy: "text",
+ horizontal_strategy: "text");
+
+ Console.WriteLine($"Found {textTables.Count} table(s) with 'text' strategy");
+
+ // Test 4: Get tables from all pages
+ Console.WriteLine("\n--- Test 4: Get tables from all pages ---");
+ int totalTables = 0;
+ for (int pageNum = 0; pageNum < doc.PageCount; pageNum++)
+ {
+ Page currentPage = doc[pageNum];
+ List<Table>
pageTables = Utils.GetTables( + currentPage, + clip: currentPage.Rect, + vertical_strategy: "lines_strict", + horizontal_strategy: "lines_strict"); + + if (pageTables.Count > 0) + { + Console.WriteLine($" Page {pageNum}: {pageTables.Count} table(s)"); + totalTables += pageTables.Count; + } + currentPage.Dispose(); + } + Console.WriteLine($"Total tables found across all pages: {totalTables}"); + + page.Dispose(); + doc.Close(); + + Console.WriteLine("\n=== TestTable completed successfully ==="); + } + catch (Exception ex) + { + Console.WriteLine($"Error in TestTable: {ex.Message}"); + Console.WriteLine($"Stack trace: {ex.StackTrace}"); + throw; + } + } + + static void TestPyMuPdfRagToMarkdown() + { + Console.WriteLine("\n=== TestPyMuPdfRagToMarkdown ======================="); + + try + { + // Find a test PDF file + string testFilePath = Path.GetFullPath("../../../TestDocuments/national-capitals.pdf"); + + Document doc = new Document(testFilePath); + Console.WriteLine($"Document loaded: {doc.PageCount} page(s)"); + Console.WriteLine($"Document name: {doc.Name}"); + + // Test 1: Basic ToMarkdown with default settings + Console.WriteLine("\n--- Test 1: Basic ToMarkdown (default settings) ---"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: null, // All pages + hdrInfo: null, // Auto-detect headers + writeImages: false, + embedImages: false, + ignoreImages: false, + ignoreGraphics: false, + detectBgColor: true, + imagePath: "", + imageFormat: "png", + imageSizeLimit: 0.05f, + filename: testFilePath, + forceText: true, + pageChunks: false, + pageSeparators: false, + margins: null, + dpi: 150, + pageWidth: 612, + pageHeight: null, + tableStrategy: "lines_strict", + graphicsLimit: null, + fontsizeLimit: 3.0f, + ignoreCode: false, + extractWords: false, + showProgress: false, + useGlyphs: false, + ignoreAlpha: false + ); + + string markdownFile = "TestPyMuPdfRag_Output.md"; + File.WriteAllText(markdownFile, markdown, Encoding.UTF8); + Console.WriteLine($"Markdown output saved to: {markdownFile}"); + Console.WriteLine($"Markdown length: {markdown.Length} characters"); + if (markdown.Length > 0) + { + int previewLength = Math.Min(300, markdown.Length); + Console.WriteLine($"Preview (first {previewLength} chars):\n{markdown.Substring(0, previewLength)}..."); + } + } + catch (Exception ex) + { + Console.WriteLine($"Error in basic ToMarkdown: {ex.Message}"); + } + + // Test 2: ToMarkdown with IdentifyHeaders + Console.WriteLine("\n--- Test 2: ToMarkdown with IdentifyHeaders ---"); + try + { + var identifyHeaders = new IdentifyHeaders(doc, pages: null, bodyLimit: 12.0f, maxLevels: 6); + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, // First page only + hdrInfo: identifyHeaders, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: testFilePath, + forceText: true, + showProgress: false + ); + + string markdownFile = "TestPyMuPdfRag_WithHeaders.md"; + File.WriteAllText(markdownFile, markdown, Encoding.UTF8); + Console.WriteLine($"Markdown with headers saved to: {markdownFile}"); + Console.WriteLine($"Markdown length: {markdown.Length} characters"); + } + catch (Exception ex) + { + Console.WriteLine($"Error in ToMarkdown with IdentifyHeaders: {ex.Message}"); + } + + // Test 3: ToMarkdown with TocHeaders + Console.WriteLine("\n--- Test 3: ToMarkdown with TocHeaders ---"); + try + { + var tocHeaders = new TocHeaders(doc); + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, // First page only + hdrInfo: tocHeaders, + 
writeImages: false, + embedImages: false, + ignoreImages: false, + filename: testFilePath, + forceText: true, + showProgress: false + ); + + string markdownFile = "TestPyMuPdfRag_WithToc.md"; + File.WriteAllText(markdownFile, markdown, Encoding.UTF8); + Console.WriteLine($"Markdown with TOC headers saved to: {markdownFile}"); + Console.WriteLine($"Markdown length: {markdown.Length} characters"); + } + catch (Exception ex) + { + Console.WriteLine($"Error in ToMarkdown with TocHeaders: {ex.Message}"); + } + + // Test 4: ToMarkdown with page separators + Console.WriteLine("\n--- Test 4: ToMarkdown with page separators ---"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: null, // All pages + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: testFilePath, + forceText: true, + pageSeparators: true, // Add page separators + showProgress: false + ); + + string markdownFile = "TestPyMuPdfRag_WithSeparators.md"; + File.WriteAllText(markdownFile, markdown, Encoding.UTF8); + Console.WriteLine($"Markdown with page separators saved to: {markdownFile}"); + Console.WriteLine($"Markdown length: {markdown.Length} characters"); + } + catch (Exception ex) + { + Console.WriteLine($"Error in ToMarkdown with page separators: {ex.Message}"); + } + + // Test 5: ToMarkdown with progress bar + Console.WriteLine("\n--- Test 5: ToMarkdown with progress bar ---"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: null, // All pages + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: testFilePath, + forceText: true, + showProgress: true, // Show progress bar + pageSeparators: false + ); + + string markdownFile = "TestPyMuPdfRag_WithProgress.md"; + File.WriteAllText(markdownFile, markdown, Encoding.UTF8); + Console.WriteLine($"\nMarkdown with progress saved to: {markdownFile}"); + Console.WriteLine($"Markdown length: {markdown.Length} characters"); + } + catch (Exception ex) + { + Console.WriteLine($"Error in ToMarkdown with progress: {ex.Message}"); + } + + doc.Close(); + } + catch (Exception ex) + { + Console.WriteLine($"An unexpected error occurred during PyMuPdfRag test: {ex.Message}"); + Console.WriteLine($"Stack trace: {ex.StackTrace}"); + } + + Console.WriteLine("\n=== TestPyMuPdfRagToMarkdown Completed ======================="); + } + + static void TestLLM() + { + Console.WriteLine("\n=== TestLLM ======================="); + + try + { + // Display version information + Console.WriteLine($"MuPDF.NET4LLM Version: {MuPDF4LLM.Version}"); + var versionTuple = MuPDF4LLM.VersionTuple; + Console.WriteLine($"Version Tuple: ({versionTuple.major}, {versionTuple.minor}, {versionTuple.patch})"); + + // Test with a sample PDF file + string testFilePath = Path.GetFullPath("../../../TestDocuments/national-capitals.pdf"); + + // Try to find a PDF with actual content if Blank.pdf doesn't work well + if (!File.Exists(testFilePath)) + { + testFilePath = Path.GetFullPath("../../../TestDocuments/Widget.pdf"); + } + + if (!File.Exists(testFilePath)) + { + Console.WriteLine($"Test PDF file not found. 
Skipping LLM test."); + return; + } + + Console.WriteLine($"\nTesting with PDF: {testFilePath}"); + + Document doc = new Document(testFilePath); + Console.WriteLine($"Document loaded: {doc.PageCount} page(s)"); + + doc.Close(); + Console.WriteLine("\nLLM test completed successfully."); + } + catch (Exception ex) + { + Console.WriteLine($"Error in TestLLM: {ex.Message}"); + Console.WriteLine($"Stack trace: {ex.StackTrace}"); + } + } + static void TestIssue1880() { Console.WriteLine("\n=== TestIssue1880 ======================="); diff --git a/Demo/TestDocuments/Magazine.pdf b/Demo/TestDocuments/Magazine.pdf new file mode 100644 index 0000000..c8e166e Binary files /dev/null and b/Demo/TestDocuments/Magazine.pdf differ diff --git a/Demo/TestDocuments/national-capitals.pdf b/Demo/TestDocuments/national-capitals.pdf new file mode 100644 index 0000000..d2b4721 Binary files /dev/null and b/Demo/TestDocuments/national-capitals.pdf differ diff --git a/Demo/annotations-freetext2.cs b/Demo/annotations-freetext2.cs index fa5493c..fb3a0e3 100644 --- a/Demo/annotations-freetext2.cs +++ b/Demo/annotations-freetext2.cs @@ -25,7 +25,7 @@ public static void Run(string[] args) // the annotation text with HTML and styling syntax string text = $@"

-PyMuPDF འདི་ ཡིག་ཆ་བཀྲམ་སྤེལ་གྱི་དོན་ལུ་ པའི་ཐོན་ཐུམ་སྒྲིལ་དྲག་ཤོས་དང་མགྱོགས་ཤོས་ཅིག་ཨིན། +MuPDF.NET འདི་ ཡིག་ཆ་བཀྲམ་སྤེལ་གྱི་དོན་ལུ་ པའི་ཐོན་ཐུམ་སྒྲིལ་དྲག་ཤོས་དང་མགྱོགས་ཤོས་ཅིག་ཨིན། Here is some bold and italic text, followed by bold-italic. Text-based check boxes: {bullet}.

"; diff --git a/MuPDF.NET.Test/AnnotTest.cs b/MuPDF.NET.Test/AnnotTest.cs index f34a372..9573c38 100644 --- a/MuPDF.NET.Test/AnnotTest.cs +++ b/MuPDF.NET.Test/AnnotTest.cs @@ -334,7 +334,7 @@ public void TestRichText() string bullet = "\u2610\u2611\u2612"; // Output: ☐☑☒; string text = $@"

-PyMuPDF འདི་ ཡིག་ཆ་བཀྲམ་སྤེལ་གྱི་དོན་ལུ་ པའི་ཐོན་ཐུམ་སྒྲིལ་དྲག་ཤོས་དང་མགྱོགས་ཤོས་ཅིག་ཨིན། +MuPDF.NET འདི་ ཡིག་ཆ་བཀྲམ་སྤེལ་གྱི་དོན་ལུ་ པའི་ཐོན་ཐུམ་སྒྲིལ་དྲག་ཤོས་དང་མགྱོགས་ཤོས་ཅིག་ཨིན། Here is some bold and italic text, followed by bold-italic. Text-based check boxes: {bullet}.

"; diff --git a/MuPDF.NET.Test/GeneralTest.cs b/MuPDF.NET.Test/GeneralTest.cs index 2c63deb..82dfe02 100644 --- a/MuPDF.NET.Test/GeneralTest.cs +++ b/MuPDF.NET.Test/GeneralTest.cs @@ -486,14 +486,6 @@ assert repr(ee) == expected, f'Expected {expected=} but got {repr(ee)=}.' { Console.WriteLine($"test_2548(): {Utils.MUPDF_WARNINGS_STORE[i]}"); } - - // This checks that PyMuPDF 1.23.7 fixes this bug, and also that earlier - // versions with updated MuPDF also fix the bug. - //rebased = hasattr(pymupdf, 'mupdf') - //expected = 'format error: cycle in structure tree\nstructure tree broken, assume tree is missing' - //if rebased: - // assert wt == expected, f'expected:\n {expected!r}\nwt:\n {wt!r}\n' - //assert not e } [Test] diff --git a/MuPDF.NET.Test/resources/test_1645_expected.pdf b/MuPDF.NET.Test/resources/test_1645_expected.pdf index 1196788..55f59f4 100644 Binary files a/MuPDF.NET.Test/resources/test_1645_expected.pdf and b/MuPDF.NET.Test/resources/test_1645_expected.pdf differ diff --git a/MuPDF.NET.sln b/MuPDF.NET.sln index 3a66542..755bc8f 100644 --- a/MuPDF.NET.sln +++ b/MuPDF.NET.sln @@ -1,4 +1,4 @@ - + Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 17 VisualStudioVersion = 17.14.36511.14 @@ -9,6 +9,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MuPDF.NET.Test", "MuPDF.NET EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Demo", "Demo\Demo.csproj", "{D1CCB24F-A868-F185-9228-8CC249247C79}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MuPDF.NET4LLM", "MuPDF.NET4LLM\MuPDF.NET4LLM.csproj", "{9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MuPDF.NET4LLM.Test", "MuPDF.NET4LLM.Test\MuPDF.NET4LLM.Test.csproj", "{5498436C-E1C0-418D-9DA3-0460A3C15953}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -55,6 +59,30 @@ Global {D1CCB24F-A868-F185-9228-8CC249247C79}.Release|x64.Build.0 = Release|x64 {D1CCB24F-A868-F185-9228-8CC249247C79}.Release|x86.ActiveCfg = Release|x86 {D1CCB24F-A868-F185-9228-8CC249247C79}.Release|x86.Build.0 = Release|x86 + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|Any CPU.Build.0 = Debug|Any CPU + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|x64.ActiveCfg = Debug|x64 + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|x64.Build.0 = Debug|x64 + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|x86.ActiveCfg = Debug|x86 + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|x86.Build.0 = Debug|x86 + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|Any CPU.ActiveCfg = Release|Any CPU + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|Any CPU.Build.0 = Release|Any CPU + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|x64.ActiveCfg = Release|x64 + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|x64.Build.0 = Release|x64 + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|x86.ActiveCfg = Release|x86 + {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|x86.Build.0 = Release|x86 + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|Any CPU.Build.0 = Debug|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|x64.ActiveCfg = Debug|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|x64.Build.0 = Debug|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|x86.ActiveCfg = Debug|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|x86.Build.0 = Debug|Any 
CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|Any CPU.ActiveCfg = Release|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|Any CPU.Build.0 = Release|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|x64.ActiveCfg = Release|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|x64.Build.0 = Release|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|x86.ActiveCfg = Release|Any CPU + {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/MuPDF.NET/Document.cs b/MuPDF.NET/Document.cs index 0622056..864e4d0 100644 --- a/MuPDF.NET/Document.cs +++ b/MuPDF.NET/Document.cs @@ -1685,7 +1685,7 @@ private void _DeletePage(int pno) /// Create a table of contents. /// /// a bool to control output. - /// Returns a list, where each entry consists of outline level, title, page number and link destination (if simple = False). For details see PyMuPDF's documentation. + /// Returns a list, where each entry consists of outline level, title, page number and link destination (if simple = False). For details see MuPDF's documentation. /// public List GetToc(bool simple = true) { @@ -5950,6 +5950,23 @@ public void Bake(bool annots = true, bool widgets = true) pdf.Dispose(); } + public void Dispose() + { + if (IsClosed) + throw new Exception("document closed"); + + if (Outline != null) + { + Outline.Dispose(); + Outline = null; + } + ResetPageRefs(); + IsClosed = true; + GraftMaps = new Dictionary(); + _nativeDocument.Dispose(); + _nativeDocument = null; + } + public void Close() { if (IsClosed) @@ -6000,7 +6017,7 @@ public int AddOcg( PdfObj useFor = ocg.pdf_dict_put_dict(new PdfObj("Usage"), 3); PdfObj ciName = mupdf.mupdf.pdf_new_name("CreatorInfo"); PdfObj creInfo = useFor.pdf_dict_put_dict(ciName, 2); - creInfo.pdf_dict_put_text_string(new PdfObj("Creator"), "PyMuPDF"); + creInfo.pdf_dict_put_text_string(new PdfObj("Creator"), "MuPDF"); if (!string.IsNullOrEmpty(usage)) creInfo.pdf_dict_put_name(new PdfObj("Subtype"), usage); diff --git a/MuPDF.NET/Page.cs b/MuPDF.NET/Page.cs index 1f192d9..83bf2cd 100644 --- a/MuPDF.NET/Page.cs +++ b/MuPDF.NET/Page.cs @@ -1,4 +1,4 @@ -using mupdf; +using mupdf; using SkiaSharp; using System; using System.Collections; @@ -10,7 +10,6 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Text; -using static MuPDF.NET.Global; using static System.Net.Mime.MediaTypeNames; namespace MuPDF.NET diff --git a/MuPDF.NET/Table.cs b/MuPDF.NET/Table.cs index 5fbcbd3..3d432f2 100644 --- a/MuPDF.NET/Table.cs +++ b/MuPDF.NET/Table.cs @@ -1,2007 +1,2296 @@ -using System; +/* +Copyright (C) 2023 Artifex Software, Inc. + +This file is part of MuPDF.NET. + +MuPDF.NET is free software: you can redistribute it and/or modify it under the +terms of the GNU Affero General Public License as published by the Free +Software Foundation, either version 3 of the License, or (at your option) +any later version. + +MuPDF.NET is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +details. + +You should have received a copy of the GNU Affero General Public License +along with MuPDF. If not, see + +Alternative licensing terms are available from the licensor. 
+For commercial licensing, see or contact +Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, +CA 94129, USA, for further information. + +--------------------------------------------------------------------- +Portions of this code have been ported from pdfplumber, see +https://pypi.org/project/pdfplumber/. + +The ported code is under the following MIT license: + +--------------------------------------------------------------------- +The MIT License (MIT) + +Copyright (c) 2015, Jeremy Singer-Vine + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +--------------------------------------------------------------------- +Also see here: https://github.com/jsvine/pdfplumber/blob/stable/LICENSE.txt +--------------------------------------------------------------------- + +The porting mainly pertains to files "table.py" and relevant parts of +"utils/text.py" within pdfplumber's repository on Github. +With respect to "text.py", we have removed functions or features that are not +used by table processing. Examples are: + +* the text search function +* simple text extraction +* text extraction by lines + +Original pdfplumber code does neither detect, nor identify table headers. +This MuPDF.NET port adds respective code to the 'Table' class as method '_get_header'. +This is implemented as new class TableHeader with the properties: +* bbox: A tuple for the header's bbox +* cells: A tuple for each bbox of a column header +* names: A list of strings with column header text +* external: A bool indicating whether the header is outside the table cells. 
+ +*/ + +using mupdf; +using System; using System.Collections.Generic; -using System.Data; +using System.Drawing; using System.Linq; -using System.Net; -using System.Reflection; using System.Text; -using static MuPDF.NET.Global; +using System.Text.RegularExpressions; namespace MuPDF.NET { - public class Global + // Global state for table processing + internal static class TableGlobals { - public class Edge - { - public float x0; - public float y0; - public float x1; - public float y1; - public float width; - public float height; - public Point[] pts; - public float linewidth; - public bool stroke; - public bool fill; - public bool evenodd; - public float[] stroking_color; - public float[] non_stroking_color; - public string object_type; - public int page_number; - public object stroking_pattern; - public object non_stroking_pattern; - public float top; - public float bottom; - public float doctop; - public string orientation; - } - - public class Character - { - public float adv; - public float bottom; - public float doctop; - public string fontname; - public float height; - public Matrix matrix; - public string ncs; - public int non_stroking_color; - public object non_stroking_pattern; - public string object_type; - public int page_number; - public float size; - public int stroking_color; - public object stroking_pattern; - public string text; - public float top; - public bool upright; - public int direction; - public int rotation; - public float width; - public float x0; - public float x1; - public float y0; - public float y1; - } - - // Function to check if the extracted text contains only whitespace characters - public static bool whiteSpaces_issuperset(string text) - { - HashSet whiteSpaces = new HashSet(new[] { - ' ', '\t', '\n', '\r', '\v', '\f' - }); - // Check if all characters in the extracted text are whitespace characters - return text.All(c => whiteSpaces.Contains(c)); - } + internal static List EDGES = new List(); // vector graphics from MuPDF + internal static List CHARS = new List(); // text characters from MuPDF + internal static TextPage TEXTPAGE = null; // textpage for cell text extraction + + // Constants matching Python implementation from __init__.py + internal static readonly HashSet WHITE_SPACES = new HashSet { ' ', '\t', '\n', '\r', '\f', '\v' }; + // From __init__.py: TEXT_FONT_BOLD = 16, but for char flags use FZ_STEXT_BOLD + internal static readonly int TEXT_BOLD = (int)mupdf.mupdf.FZ_STEXT_BOLD; + // From __init__.py: TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT + internal static readonly int TEXT_STRIKEOUT = (int)mupdf.mupdf.FZ_STEXT_STRIKEOUT; + // From __init__.py: TEXT_COLLECT_STYLES = mupdf.FZ_STEXT_COLLECT_STYLES + internal static readonly int TEXT_COLLECT_STYLES = (int)mupdf.mupdf.FZ_STEXT_COLLECT_STYLES; + // From __init__.py: TEXT_COLLECT_VECTORS = mupdf.FZ_STEXT_COLLECT_VECTORS + internal static readonly int TEXT_COLLECT_VECTORS = (int)mupdf.mupdf.FZ_STEXT_COLLECT_VECTORS; + // From __init__.py: TEXT_SEGMENT = mupdf.FZ_STEXT_SEGMENT + internal static readonly int TEXT_SEGMENT = (int)mupdf.mupdf.FZ_STEXT_SEGMENT; + // From table.py FLAGS: TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES | TEXT_ACCURATE_BBOXES | TEXT_MEDIABOX_CLIP + internal static readonly int FLAGS = + (int)TextFlagsExtension.TEXTFLAGS_TEXT | + TEXT_COLLECT_STYLES | + (int)TextFlags.TEXT_ACCURATE_BBOXES | + (int)TextFlags.TEXT_MEDIABOX_CLIP; + // From table.py TABLE_DETECTOR_FLAGS: TEXT_ACCURATE_BBOXES | TEXT_SEGMENT | TEXT_COLLECT_VECTORS | TEXT_MEDIABOX_CLIP + internal static readonly int 
TABLE_DETECTOR_FLAGS = + (int)TextFlags.TEXT_ACCURATE_BBOXES | + TEXT_SEGMENT | + TEXT_COLLECT_VECTORS | + (int)TextFlags.TEXT_MEDIABOX_CLIP; + } - public class BBox + // Constants + internal static class TableConstants + { + internal static readonly string[] NON_NEGATIVE_SETTINGS = { + "snap_tolerance", "snap_x_tolerance", "snap_y_tolerance", + "join_tolerance", "join_x_tolerance", "join_y_tolerance", + "edge_min_length", "min_words_vertical", "min_words_horizontal", + "intersection_tolerance", "intersection_x_tolerance", "intersection_y_tolerance" + }; + + internal static readonly Dictionary LIGATURES = new Dictionary { - public float x0 { get; set; } - public float top { get; set; } - public float x1 { get; set; } - public float bottom { get; set; } + { "ff", "ff" }, + { "ffi", "ffi" }, + { "ffl", "ffl" }, + { "fi", "fi" }, + { "fl", "fl" }, + { "st", "st" }, + { "ſt", "st" } + }; + } - public BBox(float x0, float top, float x1, float bottom) - { - this.x0 = x0; - this.top = top; - this.x1 = x1; - this.bottom = bottom; - } + // Character dictionary structure matching Python implementation + internal class CharDict + { + public float adv { get; set; } + public float bottom { get; set; } + public float doctop { get; set; } + public string fontname { get; set; } + public float height { get; set; } + public Tuple matrix { get; set; } + public string ncs { get; set; } + public Tuple non_stroking_color { get; set; } + public object non_stroking_pattern { get; set; } + public string object_type { get; set; } + public int page_number { get; set; } + public float size { get; set; } + public Tuple stroking_color { get; set; } + public object stroking_pattern { get; set; } + public bool bold { get; set; } + public string text { get; set; } + public float top { get; set; } + public bool upright { get; set; } + public float width { get; set; } + public float x0 { get; set; } + public float x1 { get; set; } + public float y0 { get; set; } + public float y1 { get; set; } + } - // Union method: Combine two rectangles into one that covers both. - public BBox Union(BBox other) - { - float newX0 = Math.Min(this.x0, other.x0); - float newTop = Math.Min(this.top, other.top); - float newX1 = Math.Max(this.x1, other.x1); - float newBottom = Math.Max(this.bottom, other.bottom); + // Edge structure for table detection + public class Edge + { + public float x0 { get; set; } + public float x1 { get; set; } + public float top { get; set; } + public float bottom { get; set; } + public float width { get; set; } + public float height { get; set; } + public string orientation { get; set; } // "h" or "v" + public string object_type { get; set; } + public float doctop { get; set; } + public int page_number { get; set; } + public float y0 { get; set; } + public float y1 { get; set; } + } - return new BBox(newX0, newTop, newX1, newBottom); - } + // Helper functions + internal static class TableHelpers + { + // rect_in_rect - Check whether rectangle 'inner' is fully inside rectangle 'outer' + internal static bool RectInRect(Rect inner, Rect outer) + { + return inner.X0 >= outer.X0 && inner.Y0 >= outer.Y0 && + inner.X1 <= outer.X1 && inner.Y1 <= outer.Y1; + } - // Overload the |= operator to union two rectangles. 
- public static BBox operator |(BBox r1, BBox r2) - { - return r1.Union(r2); - } + // chars_in_rect - Check whether any of the chars are inside rectangle + internal static bool CharsInRect(List chars, Rect rect) + { + return chars.Any(c => + rect.X0 <= c.x0 && c.x1 <= rect.X1 && + rect.Y0 <= c.y0 && c.y1 <= rect.Y1); + } - public bool IsEmpty() - { - if (x0 == 0 && top == 0 && x1 == 0 && bottom == 0) - return true; - return false; - } + // _iou - Compute intersection over union of two rectangles + internal static float Iou(Rect r1, Rect r2) + { + float ix = Math.Max(0, Math.Min(r1.X1, r2.X1) - Math.Max(r1.X0, r2.X0)); + float iy = Math.Max(0, Math.Min(r1.Y1, r2.Y1) - Math.Max(r1.Y0, r2.Y0)); + float intersection = ix * iy; + if (intersection == 0) + return 0; + float area1 = (r1.X1 - r1.X0) * (r1.Y1 - r1.Y0); + float area2 = (r2.X1 - r2.X0) * (r2.Y1 - r2.Y0); + return intersection / (area1 + area2 - intersection); + } - // Override Equals and GetHashCode for Distinct to work correctly - public override bool Equals(object obj) - { - return obj is BBox bbox && - x0 == bbox.x0 && - top == bbox.top && - x1 == bbox.x1 && - bottom == bbox.bottom; - } + // intersects_words_h - Check whether any words are cut through by horizontal line y + internal static bool IntersectsWordsH(Rect bbox, float y, List wordRects) + { + return wordRects.Any(r => RectInRect(r, bbox) && r.Y0 < y && y < r.Y1); + } - public static int CombineHashCodes(float x0, float top, float x1, float bottom) - { - // Start with a prime number to mix in the values. - int hash = 17; + // get_table_dict_from_rect - Extract MuPDF table structure information + // Note: This requires native MuPDF interop to call fz_find_table_within_bounds + // The Python version calls: pymupdf.extra.make_table_dict(textpage.this.m_internal, table_dict, rect) + // This would need to be implemented via P/Invoke or native wrapper + internal static Dictionary GetTableDictFromRect(TextPage textpage, Rect rect) + { + var tableDict = new Dictionary(); + // TODO: Implement native interop call to MuPDF's table detection function + // This is used by make_table_from_bbox which is called when layout_information finds tables + return tableDict; + } - // Combine each hash code using XOR and a prime number multiplier. - hash = hash * 31 + x0.GetHashCode(); - hash = hash * 31 + top.GetHashCode(); - hash = hash * 31 + x1.GetHashCode(); - hash = hash * 31 + bottom.GetHashCode(); + // make_table_from_bbox - Detect table structure within a given rectangle + internal static List MakeTableFromBbox(TextPage textpage, List wordRects, Rect rect) + { + var cells = new List(); + var block = GetTableDictFromRect(textpage, rect); + + if (!block.ContainsKey("type") || Convert.ToInt32(block["type"]) != mupdf.mupdf.FZ_STEXT_BLOCK_GRID) + return cells; + + var bboxList = block["bbox"] as List; + if (bboxList == null || bboxList.Count < 4) + return cells; + + var bbox = new Rect( + Convert.ToSingle(bboxList[0]), + Convert.ToSingle(bboxList[1]), + Convert.ToSingle(bboxList[2]), + Convert.ToSingle(bboxList[3]) + ); - return hash; + var xpos = (block["xpos"] as List)?.Cast>() + .Select(x => Tuple.Create(Convert.ToSingle(x[0]), Convert.ToSingle(x[1]))) + .OrderBy(x => x.Item1).ToList() ?? new List>(); + + var ypos = (block["ypos"] as List)?.Cast>() + .Select(y => Tuple.Create(Convert.ToSingle(y[0]), Convert.ToSingle(y[1]))) + .OrderBy(y => y.Item1).ToList() ?? new List>(); + + var maxUncertain = block["max_uncertain"] as List; + float xmaxu = maxUncertain != null && maxUncertain.Count > 0 ? 
Convert.ToSingle(maxUncertain[0]) : 0; + float ymaxu = maxUncertain != null && maxUncertain.Count > 1 ? Convert.ToSingle(maxUncertain[1]) : 0; + + // Modify ypos to remove uncertain positions + var nypos = new List(); + foreach (var (y, yunc) in ypos) + { + if (yunc > 0) continue; + if (IntersectsWordsH(bbox, y, wordRects)) continue; + if (nypos.Count > 0 && (y - nypos[nypos.Count - 1] < 3)) + nypos[nypos.Count - 1] = y; + else + nypos.Add(y); } - public override int GetHashCode() - { - //return HashCode.Combine(x0, top, x1, bottom); - return CombineHashCodes(x0, top, x1, bottom); - } + ymaxu = Math.Max(0, (float)Math.Round((nypos.Count - 2) * 0.35)); - public static BBox RectToBBox(Rect rect) - { - return new BBox(rect.X0, rect.Y0, rect.X1, rect.Y1); - } + var nxpos = xpos.Where(x => x.Item2 <= ymaxu).Select(x => x.Item1).ToList(); + if (bbox.X1 > nxpos[nxpos.Count - 1] + 3) + nxpos.Add(bbox.X1); - public static Rect BBoxToRect(BBox bbox) + // Compose cells from remaining x and y positions + for (int i = 0; i < nypos.Count - 1; i++) { - return new Rect(bbox.x0, bbox.top, bbox.x1, bbox.bottom); + var rowBox = new Rect(bbox.X0, nypos[i], bbox.X1, nypos[i + 1]); + var rowWords = wordRects.Where(r => RectInRect(r, rowBox)) + .OrderBy(r => r.X0).ToList(); + + var thisXpos = nxpos.Where(x => !rowWords.Any(r => r.X0 < x && x < r.X1)).ToList(); + + for (int j = 0; j < thisXpos.Count - 1; j++) + { + var cell = new Rect(thisXpos[j], nypos[i], thisXpos[j + 1], nypos[i + 1]); + if (!cell.IsEmpty) + cells.Add(cell); + } } + + return cells; } - public static Edge line_to_edge(Edge line) + // extract_cells - Extract text from a cell as plain or MD styled text + internal static string ExtractCells(TextPage textpage, Rect cell, bool markdown = false) { - // Create a new dictionary to hold the edge data - var edge = line; - - // Determine the orientation - string orientation = (Convert.ToSingle(line.top) == Convert.ToSingle(line.bottom)) ? 
"h" : "v"; + if (textpage == null) + return ""; - // Add or update the "orientation" key in the dictionary - edge.orientation = orientation; + var text = new StringBuilder(); + var pageInfo = textpage.ExtractRAWDict(cropbox: null, sort: false); - return edge; - } + if (pageInfo?.Blocks == null) + return ""; - public static List rect_to_edges(Edge rect) - { - Edge top = new Edge + foreach (var block in pageInfo.Blocks) { - object_type = "rect_edge", - height = 0, - y0 = rect.y1, - bottom = rect.top, - orientation = "h" - }; + if (block.Type != 0) continue; - Edge bottom = new Edge - { - object_type = "rect_edge", - height = 0, - y1 = rect.y0, - top = rect.top + rect.height, - doctop = rect.doctop + rect.height, - orientation = "h" - }; + var blockBbox = block.Bbox; + if (blockBbox == null) continue; - Edge left = new Edge - { - object_type = "rect_edge", - width = 0, - x1 = rect.x0, - orientation = "v" - }; + if (blockBbox.X0 > cell.X1 || blockBbox.X1 < cell.X0 || + blockBbox.Y0 > cell.Y1 || blockBbox.Y1 < cell.Y0) + continue; - Edge right = new Edge - { - object_type = "rect_edge", - width = 0, - x0 = rect.x1, - orientation = "v" - }; + if (block.Lines == null) continue; - return new List { top, bottom, left, right }; - } + foreach (var line in block.Lines) + { + if (line.Bbox == null) continue; - public static List curve_to_edges(Edge curve) - { - // Extract points and other properties from the curve - Point[] points = curve.pts; + var lbbox = line.Bbox; + if (lbbox.X0 > cell.X1 || lbbox.X1 < cell.X0 || + lbbox.Y0 > cell.Y1 || lbbox.Y1 < cell.Y0) + continue; - var edges = new List(); + if (text.Length > 0) + text.Append(markdown ? "
" : "\n"); - for (int i = 0; i < points.Length - 1; i++) - { - Point p0 = points[i]; - Point p1 = points[i + 1]; + var lineDir = line.Dir; + bool horizontal = lineDir != null && + (lineDir.X == 0 && lineDir.Y == 1 || lineDir.X == 1 && lineDir.Y == 0); - var edge = new Edge - { - object_type = "curve_edge", - x0 = Math.Min(p0.X, p1.X), - x1 = Math.Max(p0.X, p1.X), - top = Math.Min(p0.Y, p1.Y), - doctop = Math.Min(p0.Y, p1.Y) + (curve.doctop - curve.top), - bottom = Math.Max(p0.Y, p1.Y), - width = Math.Abs(p0.X - p1.X), - height = Math.Abs(p0.Y - p1.Y), - orientation = (p0.X == p1.X) ? "v" : (p0.Y == p1.Y) ? "h" : null - }; + if (line.Spans == null) continue; + + foreach (var span in line.Spans) + { + if (span.Bbox == null) continue; + + var sbbox = span.Bbox; + if (sbbox.X0 > cell.X1 || sbbox.X1 < cell.X0 || + sbbox.Y0 > cell.Y1 || sbbox.Y1 < cell.Y0) + continue; + + var spanText = new StringBuilder(); + if (span.Chars != null) + { + foreach (var char_ in span.Chars) + { + if (char_.Bbox == null) continue; + + var charRect = new Rect(char_.Bbox); + var cellRect = new Rect(cell.X0, cell.Y0, cell.X1, cell.Y1); + var intersection = charRect & cellRect; + + if (intersection != null && !intersection.IsEmpty && + (intersection.Width * intersection.Height) > 0.5 * (charRect.Width * charRect.Height)) + { + spanText.Append(char_.C); + } + else if (TableGlobals.WHITE_SPACES.Contains(char_.C)) + { + spanText.Append(" "); + } + } + } + else if (!string.IsNullOrEmpty(span.Text)) + { + spanText.Append(span.Text); + } - edges.Add(edge); + if (spanText.Length == 0) continue; + + if (!markdown) + { + text.Append(spanText); + continue; + } + + string prefix = ""; + string suffix = ""; + float flags = span.Flags; + + if (horizontal && ((int)flags & TableGlobals.TEXT_STRIKEOUT) != 0) + { + prefix += "~~"; + suffix = "~~" + suffix; + } + if (((int)flags & TableGlobals.TEXT_BOLD) != 0) + { + prefix += "**"; + suffix = "**" + suffix; + } + if (((int)flags & (int)FontStyle.TEXT_FONT_ITALIC) != 0) + { + prefix += "_"; + suffix = "_" + suffix; + } + if (((int)flags & (int)FontStyle.TEXT_FONT_MONOSPACED) != 0) + { + prefix += "`"; + suffix = "`" + suffix; + } + + string spanTextStr = spanText.ToString(); + if (span.Chars != null && span.Chars.Count > 2) + spanTextStr = spanTextStr.TrimEnd(); + + if (suffix.Length > 0 && text.ToString().EndsWith(suffix)) + { + text.Remove(text.Length - suffix.Length, suffix.Length); + text.Append(spanTextStr + suffix); + } + else + { + if (string.IsNullOrWhiteSpace(spanTextStr)) + text.Append(" "); + else + text.Append(prefix + spanTextStr + suffix); + } + } + } } - return edges; + return text.ToString().Trim(); } - public static List obj_to_edges(Edge obj) + // to_list - Convert collection to list + internal static List ToList(object collection) { - string type = obj.object_type; - - if (type.Contains("_edge")) - { - // If it's an edge object, return it as-is. 
- return new List { obj }; - } - else if (type == "line") - { - // If it's a line, process it using line_to_edge (you'll need to define line_to_edge method) - return new List { line_to_edge(obj) }; - } - else if (type == "rect") - { - return rect_to_edges(obj); - } - else if (type == "curve") - { - return curve_to_edges(obj); - } - return null; + if (collection is List list) + return list; + if (collection is IEnumerable enumerable) + return enumerable.ToList(); + return new List { (T)collection }; } - // Filter edges based on orientation, type, and minimum length - public static List filter_edges( - List edges, - string orientation = null, - string edgeType = null, - float minLength = 1 - ) + // Helper function for clustering objects + internal static List> ClusterObjects(IEnumerable xs, Func keyFn, float tolerance) { - // Validate orientation - if (orientation != null && orientation != "v" && orientation != "h") - { - throw new ArgumentException("Orientation must be 'v' or 'h'"); - } + if (tolerance == 0) + return xs.OrderBy(keyFn).Select(x => new List { x }).ToList(); - // Function to test if an edge meets the criteria - bool test(Edge e) - { - // Determine the dimension (width or height) based on orientation - float dimension = (e.orientation == "v") ? e.height : e.width; + var xsList = xs.ToList(); + if (xsList.Count < 2) + return xsList.Select(x => new List { x }).ToList(); - bool etCorrect = edgeType == null || e.object_type == edgeType; - bool orientCorrect = orientation == null || e.orientation == orientation; + var values = xsList.Select(keyFn).Distinct().OrderBy(v => v).ToList(); + var clusters = ClusterList(values, tolerance); - return etCorrect && orientCorrect && dimension >= minLength; + var clusterDict = new Dictionary(); + for (int i = 0; i < clusters.Count; i++) + { + foreach (var val in clusters[i]) + clusterDict[val] = i; } - // Use LINQ to filter edges - return edges.Where(test).ToList(); + var grouped = xsList.GroupBy(x => clusterDict[keyFn(x)]).OrderBy(g => g.Key); + return grouped.Select(g => g.ToList()).ToList(); } - public static List> cluster_list(List xs, float tolerance = 0f) + internal static List> ClusterList(List xs, float tolerance = 0) { if (tolerance == 0) - { return xs.OrderBy(x => x).Select(x => new List { x }).ToList(); - } if (xs.Count < 2) - { - return xs.OrderBy(x => x).Select(x => new List { x }).ToList(); - } + return xs.Select(x => new List { x }).ToList(); var groups = new List>(); - xs.Sort(); - var currentGroup = new List { xs[0] }; - float last = xs[0]; + var sortedXs = xs.OrderBy(x => x).ToList(); + var currentGroup = new List { sortedXs[0] }; + float last = sortedXs[0]; - foreach (var x in xs.Skip(1)) + for (int i = 1; i < sortedXs.Count; i++) { - if (x <= last + tolerance) + float x = sortedXs[i]; + if (x <= (last + tolerance)) { currentGroup.Add(x); } else { - groups.Add(new List(currentGroup)); + groups.Add(currentGroup); currentGroup = new List { x }; } last = x; } - groups.Add(currentGroup); return groups; } - public static Dictionary make_cluster_dict(List values, float tolerance) + internal static Rect ObjectsToBbox(IEnumerable objects) { - var clusters = cluster_list(values.Distinct().ToList(), tolerance); - var clusterDict = new Dictionary(); - - var index = 0; - foreach (var cluster in clusters) + var rects = new List(); + foreach (var obj in objects) { - foreach (var value in cluster) + if (obj is CharDict charDict) + { + rects.Add(new Rect(charDict.x0, charDict.top, charDict.x1, charDict.bottom)); + } + else if (obj is 
Dictionary dict) { - clusterDict[value] = index; + if (dict.ContainsKey("x0") && dict.ContainsKey("top") && dict.ContainsKey("x1") && dict.ContainsKey("bottom")) + { + rects.Add(new Rect( + Convert.ToSingle(dict["x0"]), + Convert.ToSingle(dict["top"]), + Convert.ToSingle(dict["x1"]), + Convert.ToSingle(dict["bottom"]) + )); + } } - index++; } - return clusterDict; - } + if (rects.Count == 0) + return new Rect(0, 0, 0, 0); - public static List> cluster_objects(List xs, Func keyFn, float tolerance) - { - var values = xs.Select(keyFn).ToList(); - var clusterDict = make_cluster_dict(values, tolerance); + return new Rect( + rects.Min(r => r.X0), + rects.Min(r => r.Y0), + rects.Max(r => r.X1), + rects.Max(r => r.Y1) + ); + } + } - var clusterTuples = xs.Select(x => new { Object = x, ClusterId = clusterDict[keyFn(x)] }) - .OrderBy(t => t.ClusterId) - .ToList(); + // TextMap class - maps each unicode character to a char object + internal class TextMap + { + public List> tuples { get; set; } + public string as_string { get; set; } - var grouped = clusterTuples.GroupBy(t => t.ClusterId) - .Select(g => g.Select(t => t.Object).ToList()) - .ToList(); - return grouped; + public TextMap(List> tuples = null) + { + this.tuples = tuples ?? new List>(); + this.as_string = string.Join("", this.tuples.Select(t => t.Item1)); } - public static Edge move_object(Edge obj, string axis, float value) + public Dictionary MatchToDict( + Match m, + int mainGroup = 0, + bool returnGroups = true, + bool returnChars = true) { - // Ensure the axis is valid - if (axis != "h" && axis != "v") - { - throw new ArgumentException("Axis must be 'h' or 'v'", nameof(axis)); - } + var subset = tuples.Skip(m.Groups[mainGroup].Index).Take(m.Groups[mainGroup].Length).ToList(); + var chars = subset.Where(t => t.Item2 != null).Select(t => t.Item2).ToList(); + var bbox = TableHelpers.ObjectsToBbox(chars); - // Prepare the new property values - var newProperties = new List<(string, float)>(); + var result = new Dictionary + { + { "text", m.Groups[mainGroup].Value }, + { "x0", bbox.X0 }, + { "top", bbox.Y0 }, + { "x1", bbox.X1 }, + { "bottom", bbox.Y1 } + }; - if (axis == "h") + if (returnGroups) { - newProperties.Add(("x0", obj.x0 + value)); - newProperties.Add(("x1", obj.x1 + value)); + var groups = new List(); + for (int i = 1; i < m.Groups.Count; i++) + groups.Add(m.Groups[i].Value); + result["groups"] = groups; } - if (axis == "v") - { - newProperties.Add(("top", obj.top + value)); - newProperties.Add(("bottom", obj.bottom + value)); + if (returnChars) + result["chars"] = chars; + + return result; + } + } + + // WordMap class - maps words to chars + internal class WordMap + { + public List, List>> tuples { get; set; } + + public WordMap(List, List>> tuples) + { + this.tuples = tuples; + } + + public TextMap ToTextmap( + bool layout = false, + float layoutWidth = 0, + float layoutHeight = 0, + int layoutWidthChars = 0, + int layoutHeightChars = 0, + float xDensity = TableFlags.TABLE_DEFAULT_X_DENSITY, + float yDensity = TableFlags.TABLE_DEFAULT_Y_DENSITY, + float xShift = 0, + float yShift = 0, + float yTolerance = TableFlags.TABLE_DEFAULT_Y_TOLERANCE, + bool useTextFlow = false, + bool presorted = false, + bool expandLigatures = true) + { + var textmap = new List>(); - // Handle optional properties if they exist - if (obj.doctop >= 0f) + if (tuples.Count == 0) + return new TextMap(textmap); + + var expansions = expandLigatures ? 
TableConstants.LIGATURES : new Dictionary(); + + int layoutWidthCharsFinal = layoutWidthChars; + int layoutHeightCharsFinal = layoutHeightChars; + + if (layout) + { + if (layoutWidthChars > 0) { - newProperties.Add(("doctop", obj.doctop + value)); + if (layoutWidth > 0) + throw new ArgumentException("`layout_width` and `layout_width_chars` cannot both be set."); } - - if (obj.y0 >= 0f) + else { - newProperties.Add(("y0", obj.y0 - value)); - newProperties.Add(("y1", obj.y1 - value)); + layoutWidthCharsFinal = (int)Math.Round(layoutWidth / xDensity); } - } - // Create a new MyObject with the updated values - var newObj = new Edge(); - newObj = obj; - - // Update the properties - foreach (var prop in newProperties) - { - // You will need to use reflection or manual assignment for the dynamic property names - switch (prop.Item1) + if (layoutHeightChars > 0) + { + if (layoutHeight > 0) + throw new ArgumentException("`layout_height` and `layout_height_chars` cannot both be set."); + } + else { - case "x0": - newObj.x0 = prop.Item2; - break; - case "x1": - newObj.x1 = prop.Item2; - break; - case "top": - newObj.top = prop.Item2; - break; - case "bottom": - newObj.bottom = prop.Item2; - break; - case "doctop": - newObj.doctop = prop.Item2; - break; - case "y0": - newObj.y0 = prop.Item2; - break; - case "y1": - newObj.y1 = prop.Item2; - break; + layoutHeightCharsFinal = (int)Math.Round(layoutHeight / yDensity); } } - return newObj; - } + var blankLine = layout ? Enumerable.Range(0, layoutWidthCharsFinal) + .Select(_ => Tuple.Create(" ", (CharDict)null)).ToList() : new List>(); + int numNewlines = 0; - public static List snap_objects(List objs, string attr, float tolerance) - { - // Mapping the attribute to the axis (horizontal or vertical) - //string axis = attr switch - //{ - // "x0" => "h", - // "x1" => "h", - // "top" => "v", - // "bottom" => "v", - // _ => throw new ArgumentException("Invalid attribute", nameof(attr)) - //}; - string axis; - switch (attr) - { - case "x0": - case "x1": - axis = "h"; - break; - case "top": - case "bottom": - axis = "v"; - break; - default: - throw new ArgumentException("Invalid attribute", nameof(attr)); - } + var wordsSortedDoctop = presorted || useTextFlow + ? 
tuples + : tuples.OrderBy(t => Convert.ToSingle(t.Item1["doctop"])).ToList(); - List> clusters = new List>(); - List avgs = new List(); - List> snappedClusters = new List>(); - switch (attr) - { - case "x0": - clusters = cluster_objects(objs, obj => obj.x0, tolerance); - avgs = clusters.Select(cluster => cluster.Average(obj => obj.x0)).ToList(); - snappedClusters = clusters.Select((cluster, idx) => - cluster.Select(obj => move_object(obj, axis, avgs[idx] - (float)obj.x0)).ToList()).ToList(); - break; - case "x1": - clusters = cluster_objects(objs, obj => obj.x1, tolerance); - avgs = clusters.Select(cluster => cluster.Average(obj => obj.x1)).ToList(); - snappedClusters = clusters.Select((cluster, idx) => - cluster.Select(obj => move_object(obj, axis, avgs[idx] - (float)obj.x1)).ToList()).ToList(); - break; - case "top": - clusters = cluster_objects(objs, obj => obj.top, tolerance); - avgs = clusters.Select(cluster => cluster.Average(obj => obj.top)).ToList(); - snappedClusters = clusters.Select((cluster, idx) => - cluster.Select(obj => move_object(obj, axis, avgs[idx] - (float)obj.top)).ToList()).ToList(); - break; - case "bottom": - clusters = cluster_objects(objs, obj => obj.bottom, tolerance); - avgs = clusters.Select(cluster => cluster.Average(obj => obj.bottom)).ToList(); - snappedClusters = clusters.Select((cluster, idx) => - cluster.Select(obj => move_object(obj, axis, avgs[idx] - (float)obj.bottom)).ToList()).ToList(); - break; - default: - return null; - } + if (wordsSortedDoctop.Count == 0) + return new TextMap(textmap); - // Flatten the list of snapped clusters and return - return snappedClusters.SelectMany(cluster => cluster).ToList(); - } + var firstWord = wordsSortedDoctop[0].Item1; + float doctopStart = Convert.ToSingle(firstWord["doctop"]) - Convert.ToSingle(firstWord["top"]); - // Given a list of edges, snap any within `tolerance` pixels of one another - // to their positional average. - public static List snap_edges( - List edges, - float xTolerance = 1.0f, - float yTolerance = 1.0f) - { - // Group edges by orientation - var byOrientation = new Dictionary>() - { - { "v", new List() }, - { "h", new List() } - }; + var clusters = TableHelpers.ClusterObjects(wordsSortedDoctop, t => Convert.ToSingle(t.Item1["doctop"]), yTolerance); - foreach (var edge in edges) + for (int i = 0; i < clusters.Count; i++) { - byOrientation[edge.orientation].Add(edge); - } + var ws = clusters[i]; + float yDist = layout + ? (Convert.ToSingle(ws[0].Item1["doctop"]) - (doctopStart + yShift)) / yDensity + : 0; - // Snap vertical and horizontal edges separately - List snappedV = snap_objects(byOrientation["v"], "x0", xTolerance); - List snappedH = snap_objects(byOrientation["h"], "top", yTolerance); + int numNewlinesPrepend = Math.Max( + i > 0 ? 1 : 0, + (int)Math.Round(yDist) - numNewlines + ); - // Combine and return snapped objects - return snappedV.Concat(snappedH).ToList(); - } + for (int j = 0; j < numNewlinesPrepend; j++) + { + if (textmap.Count == 0 || textmap[textmap.Count - 1].Item1 == "\n") + textmap.AddRange(blankLine); + textmap.Add(Tuple.Create("\n", (CharDict)null)); + } - // Resize the object based on the given key and value - public static Edge resize_object(Edge obj, string key, float value) - { - if (!new[] { "x0", "x1", "top", "bottom" }.Contains(key)) - { - throw new ArgumentException("Invalid key. 
Must be one of 'x0', 'x1', 'top', 'bottom'.", nameof(key)); - } + numNewlines += numNewlinesPrepend; - Edge newObj = new Edge(); - newObj = obj; + int lineLen = 0; + var lineWordsSortedX0 = presorted || useTextFlow + ? ws + : ws.OrderBy(t => Convert.ToSingle(t.Item1["x0"])).ToList(); - if (key == "x0") - { - if (value > obj.x1) throw new ArgumentException("x0 must be less than or equal to x1."); - newObj.x0 = value; - newObj.width = obj.x1 - value; - } - else if (key == "x1") - { - if (value < obj.x0) throw new ArgumentException("x1 must be greater than or equal to x0."); - newObj.x1 = value; - newObj.width = value - obj.x0; - } - else if (key == "top") - { - if (value > obj.bottom) throw new ArgumentException("top must be less than or equal to bottom."); - float oldValue = obj.top; - float diff = value - oldValue; - newObj.top = value; - newObj.doctop = obj.doctop + diff; - newObj.height = obj.height - diff; - if (obj.y1 >= 0f) - newObj.y1 = obj.y1 - diff; - } - else if (key == "bottom") - { - if (value < obj.top) throw new ArgumentException("bottom must be greater than or equal to top."); - float oldValue = obj.bottom; - float diff = value - oldValue; - newObj.bottom = value; - newObj.height = obj.height + diff; - if (obj.y0 >= 0f) - newObj.y0 = obj.y0 - diff; - } - - // Return a new object with the updated properties - return newObj; - } - - // Given a list of edges along the same infinite line, join those that - // are within `tolerance` pixels of one another. - public static List join_edge_group(List edges, string orientation, float tolerance = TableFlags.TABLE_DEFAULT_JOIN_TOLERANCE) - { - List joined = new List(); - if (orientation == "h") - { - // Sort edges by the min property - var sortedEdges = edges.OrderBy(e => e.x0).ToList(); - joined = new List { sortedEdges[0] }; - - foreach (var e in sortedEdges.Skip(1)) + foreach (Tuple, List> tuple in lineWordsSortedX0) { - var last = joined.Last(); - if (e.x0 <= last.x1 + tolerance) + var word = tuple.Item1; + var chars = tuple.Item2; + float xDist = layout ? (Convert.ToSingle(word["x0"]) - xShift) / xDensity : 0; + int numSpacesPrepend = Math.Max(Math.Min(1, lineLen), (int)Math.Round(xDist) - lineLen); + + for (int k = 0; k < numSpacesPrepend; k++) + textmap.Add(Tuple.Create(" ", (CharDict)null)); + lineLen += numSpacesPrepend; + + foreach (var c in chars) { - if (e.x1 > last.x1) + string letters = expansions.ContainsKey(c.text) ? 
expansions[c.text] : c.text; + foreach (char letter in letters) { - // Extend current edge to new extremity - joined[joined.Count - 1] = resize_object(last, "x1", e.x1); + textmap.Add(Tuple.Create(letter.ToString(), c)); + lineLen++; } } - else - { - // Edge is separate from the previous edge - joined.Add(e); - } } - } - else if (orientation == "v") - { - // Sort edges by the min property - var sortedEdges = edges.OrderBy(e => e.top).ToList(); - joined = new List { sortedEdges[0] }; - foreach (var e in sortedEdges.Skip(1)) + if (layout) { - var last = joined.Last(); - if (e.top <= last.bottom + tolerance) - { - if (e.bottom > last.bottom) - { - // Extend current edge to new extremity - joined[joined.Count - 1] = resize_object(last, "bottom", e.bottom); - } - } - else - { - // Edge is separate from the previous edge - joined.Add(e); - } + for (int k = 0; k < layoutWidthCharsFinal - lineLen; k++) + textmap.Add(Tuple.Create(" ", (CharDict)null)); } } - else + + if (layout) { - throw new ArgumentException("Orientation must be 'v' or 'h'", nameof(orientation)); + int numNewlinesAppend = layoutHeightCharsFinal - (numNewlines + 1); + for (int i = 0; i < numNewlinesAppend; i++) + { + if (i > 0) + textmap.AddRange(blankLine); + textmap.Add(Tuple.Create("\n", (CharDict)null)); + } + + if (textmap.Count > 0 && textmap[textmap.Count - 1].Item1 == "\n") + textmap.RemoveAt(textmap.Count - 1); } - return joined; + return new TextMap(textmap); } + } - // Using the `snap_edges` and `join_edge_group` methods above, - // merge a list of edges into a more "seamless" list. - public static List merge_edges( - List edges, - float snap_x_tolerance, - float snap_y_tolerance, - float join_x_tolerance, - float join_y_tolerance) - { - (string, float) get_group(Edge edge) - { - if (edge.orientation == "h") - return ("h", edge.top); - else - return ("v", edge.x0); - } + // WordExtractor class + internal class WordExtractor + { + public float x_tolerance { get; set; } + public float y_tolerance { get; set; } + public bool keep_blank_chars { get; set; } + public bool use_text_flow { get; set; } + public bool horizontal_ltr { get; set; } + public bool vertical_ttb { get; set; } + public List extra_attrs { get; set; } + public string split_at_punctuation { get; set; } + public Dictionary expansions { get; set; } - // Snap edges if tolerance values are greater than 0 - if (snap_x_tolerance > 0 || snap_y_tolerance > 0) - { - edges = snap_edges(edges, snap_x_tolerance, snap_y_tolerance); - } -/* - // Group edges by orientation - var edgeGroups = edges - .OrderBy(e => e.orientation == "h" ? e.top : e.x0) - .GroupBy(e => e.orientation == "h" ? "h" : "v"); -*/ - // Sort edges by group (orientation + position) - var _sorted = edges.OrderBy(e => e.orientation) - .ThenBy(e => e.orientation == "h" ? 
e.top : e.x0) - .ToList(); - - // Group edges by the same group key - var edgeGroups = _sorted - .GroupBy(get_group) - .ToList(); + public WordExtractor( + float x_tolerance = TableFlags.TABLE_DEFAULT_X_TOLERANCE, + float y_tolerance = TableFlags.TABLE_DEFAULT_Y_TOLERANCE, + bool keep_blank_chars = false, + bool use_text_flow = false, + bool horizontal_ltr = true, + bool vertical_ttb = false, + List extra_attrs = null, + bool split_at_punctuation = false, + bool expand_ligatures = true) + { + this.x_tolerance = x_tolerance; + this.y_tolerance = y_tolerance; + this.keep_blank_chars = keep_blank_chars; + this.use_text_flow = use_text_flow; + this.horizontal_ltr = horizontal_ltr; + this.vertical_ttb = vertical_ttb; + this.extra_attrs = extra_attrs ?? new List(); + this.split_at_punctuation = split_at_punctuation + ? "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" + : ""; + this.expansions = expand_ligatures ? TableConstants.LIGATURES : new Dictionary(); + } + + public Dictionary MergeChars(List orderedChars) + { + var bbox = TableHelpers.ObjectsToBbox(orderedChars); + float doctopAdj = orderedChars[0].doctop - orderedChars[0].top; + bool upright = orderedChars[0].upright; + int direction = (upright ? horizontal_ltr : vertical_ttb) ? 1 : -1; - // Join edges by their groups - var joinedEdges = new List(); - foreach (var group in edgeGroups) + var matrix = orderedChars[0].matrix; + int rotation = 0; + + if (!upright && matrix.Item2 < 0) { - float tolerance = group.Key.Item1 == "h" ? join_x_tolerance : join_y_tolerance; - joinedEdges.AddRange(join_edge_group(group.ToList(), group.Key.Item1, tolerance)); + orderedChars = orderedChars.AsEnumerable().Reverse().ToList(); + rotation = 270; } - return joinedEdges; - } + if (matrix.Item1 < 0 && matrix.Item4 < 0) + rotation = 180; + else if (matrix.Item2 > 0) + rotation = 90; - // Return the rectangle(i.e a dict with keys "x0", "top", "x1", - // "bottom") for an object. - public static Dictionary bbox_to_rect(BBox bbox) - { - var rect = new Dictionary - { - { "x0", bbox.x0 }, - { "top", bbox.top }, - { "x1", bbox.x1 }, - { "bottom", bbox.bottom } + var word = new Dictionary + { + { "text", string.Join("", orderedChars.Select(c => expansions.ContainsKey(c.text) ? expansions[c.text] : c.text)) }, + { "x0", bbox.X0 }, + { "x1", bbox.X1 }, + { "top", bbox.Y0 }, + { "doctop", bbox.Y0 + doctopAdj }, + { "bottom", bbox.Y1 }, + { "upright", upright }, + { "direction", direction }, + { "rotation", rotation } }; - return rect; - } + foreach (var key in extra_attrs) + { + if (orderedChars.Count > 0) + { + var prop = typeof(CharDict).GetProperty(key); + if (prop != null) + word[key] = prop.GetValue(orderedChars[0]); + } + } - // Given an iterable of objects, return the smallest rectangle(i.e.a - // dict with "x0", "top", "x1", and "bottom" keys) that contains them - // all. - public static Dictionary objects_to_rect(IEnumerable objects) - { - BBox bbox = objects_to_bbox(objects); - return bbox_to_rect(bbox); + return word; } - // Given an iterable of bounding boxes, return the smallest bounding box - // that contains them all. 
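+        // A minimal sketch of the bounding-box merge that merge_bboxes / MergeBboxes perform:
+        // the box enclosing several (x0, top, x1, bottom) boxes is just the per-coordinate
+        // min/max. The value-tuple shape below is illustrative, not the real BBox type.
+        static (float x0, float top, float x1, float bottom) MergeBoxesSketch(
+            IEnumerable<(float x0, float top, float x1, float bottom)> boxes)
+        {
+            var list = boxes.ToList();
+            if (list.Count == 0)
+                return (0f, 0f, 0f, 0f);            // same empty-input fallback as the code above
+            return (
+                list.Min(b => b.x0),                // leftmost edge
+                list.Min(b => b.top),               // topmost edge
+                list.Max(b => b.x1),                // rightmost edge
+                list.Max(b => b.bottom));           // bottommost edge
+        }
+        // MergeBoxesSketch(new[] { (0f, 0f, 10f, 5f), (8f, 2f, 20f, 12f) }) == (0, 0, 20, 12)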
- public static BBox merge_bboxes(List bboxes) + public bool CharBeginsNewWord(CharDict prevChar, CharDict currChar) { - if (bboxes.Count > 0) + if (currChar.upright) { - var x0 = bboxes.Select(b => b.x0).Min(); - var top = bboxes.Select(b => b.top).Min(); - var x1 = bboxes.Select(b => b.x1).Max(); - var bottom = bboxes.Select(b => b.bottom).Max(); + float x = x_tolerance; + float y = y_tolerance; + float ay = prevChar.top; + float cy = currChar.top; + float ax, bx, cx; + + if (horizontal_ltr) + { + ax = prevChar.x0; + bx = prevChar.x1; + cx = currChar.x0; + } + else + { + ax = -prevChar.x1; + bx = -prevChar.x0; + cx = -currChar.x1; + } - return new BBox(x0, top, x1, bottom); + return (cx < ax) || (cx > bx + x) || (cy > ay + y); } else { - return new BBox(0, 0, 0, 0); + float x = y_tolerance; + float y = x_tolerance; + float ay = prevChar.x0; + float cy = currChar.x0; + float ax, bx, cx; + + if (vertical_ttb) + { + ax = prevChar.top; + bx = prevChar.bottom; + cx = currChar.top; + } + else + { + ax = -prevChar.bottom; + bx = -prevChar.top; + cx = -currChar.bottom; + } + + return (cx < ax) || (cx > bx + x) || (cy > ay + y); } } - // Given an iterable of objects, return the smallest bounding box that - // contains them all. - public static BBox objects_to_bbox(IEnumerable objects) + public IEnumerable> IterCharsToWords(IEnumerable orderedChars) { - List bboxes = new List(); - foreach (var obj in objects) + var currentWord = new List(); + + foreach (var char_ in orderedChars) { - if (obj is Character) + string text = char_.text; + + if (!keep_blank_chars && string.IsNullOrWhiteSpace(text)) + { + if (currentWord.Count > 0) + { + yield return currentWord; + currentWord = new List(); + } + } + else if (split_at_punctuation.Contains(text)) + { + currentWord.Add(char_); + yield return currentWord; + currentWord = new List(); + } + else if (currentWord.Count > 0 && CharBeginsNewWord(currentWord[currentWord.Count - 1], char_)) { - Character ch = obj as Character; - bboxes.Add(new BBox(ch.x0, ch.top, ch.x1, ch.bottom)); + yield return currentWord; + currentWord = new List { char_ }; } else { - bboxes.Add(obj as BBox); + currentWord.Add(char_); } } - return merge_bboxes(bboxes); + + if (currentWord.Count > 0) + yield return currentWord; } - // Find(imaginary) horizontal lines that connect the tops - // of at least `word_threshold` words. - public static List words_to_edges_h(List words, int wordThreshold = (int)TableFlags.TABLE_DEFAULT_MIN_WORDS_HORIZONTAL) + public IEnumerable IterSortChars(IEnumerable chars) { - // Cluster the words by 'top' value (simulating `cluster_objects`) - var byTop = cluster_objects(words, obj => obj.top, 1); + var charsList = chars.ToList(); + var uprightClusters = TableHelpers.ClusterObjects(charsList, c => c.upright ? -1 : 0, 0); - // Filter clusters by the word threshold - List> largeClusters = byTop.Where(cluster => cluster.Count >= wordThreshold).ToList(); - - // Convert clusters to bounding rectangles - var rects = largeClusters.Select(c => objects_to_bbox(c)).ToList(); - - if (rects.Count == 0) - return new List(); - - // Find min and max x0 and x1 values - float minX0 = rects.Min(r => r.x0); - float maxX1 = rects.Max(r => r.x1); + foreach (var uprightCluster in uprightClusters) + { + bool upright = uprightCluster[0].upright; + string clusterKey = upright ? 
"doctop" : "x0"; - List edges = new List(); + var subclusters = TableHelpers.ClusterObjects(uprightCluster, c => GetCharValue(c, clusterKey), y_tolerance); - foreach (var r in rects) - { - // Add the 'top' edge for each detected row - edges.Add(new Edge + foreach (var sc in subclusters) { - x0 = minX0, - x1 = maxX1, - top = r.top, - bottom = r.top, - width = maxX1 - minX0, - orientation = "h" - }); + string sortKey = upright ? "x0" : "doctop"; + var toYield = sc.OrderBy(c => GetCharValue(c, sortKey)).ToList(); - // Add the 'bottom' edge for each detected row (catches last row) - edges.Add(new Edge - { - x0 = minX0, - x1 = maxX1, - top = r.bottom, - bottom = r.bottom, - width = maxX1 - minX0, - orientation = "h" - }); - } + if (!(upright ? horizontal_ltr : vertical_ttb)) + toYield.Reverse(); - return edges; + foreach (var c in toYield) + yield return c; + } + } } - public static BBox get_bbox_overlap(BBox a, BBox b) + private float GetCharValue(CharDict c, string key) { - float oLeft = Math.Max(a.x0, b.x0); - float oRight = Math.Min(a.x1, b.x1); - float oBottom = Math.Min(a.bottom, b.bottom); - float oTop = Math.Max(a.top, b.top); - - float oWidth = oRight - oLeft; - float oHeight = oBottom - oTop; - - if (oHeight >= 0 && oWidth >= 0 && oHeight + oWidth > 0) + switch (key) { - return new BBox(oLeft, oTop, oRight, oBottom); + case "x0": + return c.x0; + case "doctop": + return c.doctop; + case "top": + return c.top; + default: + return 0; } - return null; } - // Find(imaginary) vertical lines that connect the left, right, or - // center of at least `word_threshold` words. - public static List words_to_edges_v(List words, int wordThreshold = (int)TableFlags.TABLE_DEFAULT_MIN_WORDS_VERTICAL) + public IEnumerable, List>> IterExtractTuples(IEnumerable chars) { - // Find words that share the same left, right, or centerpoints - var byX0 = cluster_objects(words, w => w.x0, 1); - var byX1 = cluster_objects(words, w => w.x1, 1); - var byCenter = cluster_objects(words, w => (w.x0 + w.x1) / 2, 1); - - var clusters = byX0.Concat(byX1).Concat(byCenter).ToList(); - - // Find the points that align with the most words - var sortedClusters = clusters.OrderByDescending(c => c.Count).ToList(); - var largeClusters = sortedClusters.Where(c => c.Count >= wordThreshold).ToList(); + var orderedChars = use_text_flow ? chars : IterSortChars(chars); + var groupedChars = orderedChars.GroupBy(c => new { c.upright }); - // For each of those points, find the bboxes fitting all matching words - var bboxes = largeClusters.Select(c => objects_to_bbox(c)).ToList(); - - // Iterate through those bboxes, condensing overlapping bboxes - var condensedBboxes = new List(); - foreach (var bbox in bboxes) + foreach (var group in groupedChars) { - bool overlap = condensedBboxes.Any(existingBbox => get_bbox_overlap(bbox, existingBbox) != null); - if (!overlap) + foreach (var wordChars in IterCharsToWords(group)) { - condensedBboxes.Add(bbox); + yield return Tuple.Create(MergeChars(wordChars), wordChars); } } + } - if (condensedBboxes.Count == 0) - { - return new List(); - } - - var condensedRects = condensedBboxes.Select(b => bbox_to_rect(b)).ToList(); + public WordMap ExtractWordmap(IEnumerable chars) + { + return new WordMap(IterExtractTuples(chars).ToList()); + } - // Sort rectangles by x0. 
- var sortedRects = condensedRects.OrderBy(r => r["x0"]).ToList(); + public List> ExtractWords(IEnumerable chars) + { + return IterExtractTuples(chars).Select(t => t.Item1).ToList(); + } + } - float maxX1 = sortedRects.Max(r => r["x1"]); - float minTop = sortedRects.Min(r => r["top"]); - float maxBottom = sortedRects.Max(r => r["bottom"]); + // Helper functions for text extraction + internal static class TextExtractionHelpers + { + internal static List> ExtractWords(List chars, Dictionary kwargs = null) + { + if (kwargs == null) kwargs = new Dictionary(); + var extractor = new WordExtractor( + x_tolerance: kwargs.ContainsKey("x_tolerance") ? Convert.ToSingle(kwargs["x_tolerance"]) : TableFlags.TABLE_DEFAULT_X_TOLERANCE, + y_tolerance: kwargs.ContainsKey("y_tolerance") ? Convert.ToSingle(kwargs["y_tolerance"]) : TableFlags.TABLE_DEFAULT_Y_TOLERANCE, + keep_blank_chars: kwargs.ContainsKey("keep_blank_chars") && (bool)kwargs["keep_blank_chars"], + use_text_flow: kwargs.ContainsKey("use_text_flow") && (bool)kwargs["use_text_flow"], + horizontal_ltr: !kwargs.ContainsKey("horizontal_ltr") || (bool)kwargs["horizontal_ltr"], + vertical_ttb: kwargs.ContainsKey("vertical_ttb") && (bool)kwargs["vertical_ttb"], + split_at_punctuation: kwargs.ContainsKey("split_at_punctuation") && (bool)kwargs["split_at_punctuation"], + expand_ligatures: !kwargs.ContainsKey("expand_ligatures") || (bool)kwargs["expand_ligatures"] + ); + return extractor.ExtractWords(chars); + } - // Create edges based on the rectangles. - var edges = sortedRects.Select(b => new Edge - { - x0 = b["x0"], - x1 = b["x0"], - top = minTop, - bottom = maxBottom, - height = maxBottom - minTop, - orientation = "v" - }).ToList(); + internal static TextMap CharsToTextmap(List chars, Dictionary kwargs = null) + { + if (kwargs == null) kwargs = new Dictionary(); + kwargs["presorted"] = true; - edges.Add(new Edge - { - x0 = maxX1, - x1 = maxX1, - top = minTop, - bottom = maxBottom, - height = maxBottom - minTop, - orientation = "v" - }); + var extractor = new WordExtractor( + x_tolerance: kwargs.ContainsKey("x_tolerance") ? Convert.ToSingle(kwargs["x_tolerance"]) : TableFlags.TABLE_DEFAULT_X_TOLERANCE, + y_tolerance: kwargs.ContainsKey("y_tolerance") ? Convert.ToSingle(kwargs["y_tolerance"]) : TableFlags.TABLE_DEFAULT_Y_TOLERANCE, + keep_blank_chars: kwargs.ContainsKey("keep_blank_chars") && (bool)kwargs["keep_blank_chars"], + use_text_flow: kwargs.ContainsKey("use_text_flow") && (bool)kwargs["use_text_flow"], + expand_ligatures: !kwargs.ContainsKey("expand_ligatures") || (bool)kwargs["expand_ligatures"] + ); - return edges; + var wordmap = extractor.ExtractWordmap(chars); + return wordmap.ToTextmap( + layout: kwargs.ContainsKey("layout") && (bool)kwargs["layout"], + layoutWidth: kwargs.ContainsKey("layout_width") ? Convert.ToSingle(kwargs["layout_width"]) : 0, + layoutHeight: kwargs.ContainsKey("layout_height") ? Convert.ToSingle(kwargs["layout_height"]) : 0, + layoutWidthChars: kwargs.ContainsKey("layout_width_chars") ? Convert.ToInt32(kwargs["layout_width_chars"]) : 0, + layoutHeightChars: kwargs.ContainsKey("layout_height_chars") ? Convert.ToInt32(kwargs["layout_height_chars"]) : 0, + xDensity: kwargs.ContainsKey("x_density") ? Convert.ToSingle(kwargs["x_density"]) : TableFlags.TABLE_DEFAULT_X_DENSITY, + yDensity: kwargs.ContainsKey("y_density") ? Convert.ToSingle(kwargs["y_density"]) : TableFlags.TABLE_DEFAULT_Y_DENSITY, + xShift: kwargs.ContainsKey("x_shift") ? Convert.ToSingle(kwargs["x_shift"]) : 0, + yShift: kwargs.ContainsKey("y_shift") ? 
Convert.ToSingle(kwargs["y_shift"]) : 0, + yTolerance: kwargs.ContainsKey("y_tolerance") ? Convert.ToSingle(kwargs["y_tolerance"]) : TableFlags.TABLE_DEFAULT_Y_TOLERANCE, + useTextFlow: kwargs.ContainsKey("use_text_flow") && (bool)kwargs["use_text_flow"], + presorted: kwargs.ContainsKey("presorted") && (bool)kwargs["presorted"], + expandLigatures: !kwargs.ContainsKey("expand_ligatures") || (bool)kwargs["expand_ligatures"] + ); } - // Given a list of edges, return the points at which they intersect - // within `tolerance` pixels. - public class Intersection + internal static string ExtractText(List chars, Dictionary kwargs = null) { - public float x0 { get; set; } - public float top { get; set; } - public float x1 { get; set; } - public float bottom { get; set; } - public List VerticalEdges { get; set; } - public List HorizontalEdges { get; set; } + if (kwargs == null) kwargs = new Dictionary(); + var charsList = TableHelpers.ToList(chars); + if (charsList.Count == 0) + return ""; - public Intersection() - { - this.VerticalEdges = new List(); - this.HorizontalEdges = new List(); - } - } + if (kwargs.ContainsKey("layout") && (bool)kwargs["layout"]) + return CharsToTextmap(charsList, kwargs).as_string; - public static Dictionary edges_to_intersections( - List edges, float x_tolerance = 1.0f, float y_tolerance = 1.0f) - { - var intersections = new Dictionary(); + float yTolerance = kwargs.ContainsKey("y_tolerance") ? Convert.ToSingle(kwargs["y_tolerance"]) : TableFlags.TABLE_DEFAULT_Y_TOLERANCE; + var extractor = new WordExtractor( + x_tolerance: kwargs.ContainsKey("x_tolerance") ? Convert.ToSingle(kwargs["x_tolerance"]) : TableFlags.TABLE_DEFAULT_X_TOLERANCE, + y_tolerance: yTolerance, + keep_blank_chars: kwargs.ContainsKey("keep_blank_chars") && (bool)kwargs["keep_blank_chars"], + use_text_flow: kwargs.ContainsKey("use_text_flow") && (bool)kwargs["use_text_flow"], + expand_ligatures: !kwargs.ContainsKey("expand_ligatures") || (bool)kwargs["expand_ligatures"] + ); - // Separate vertical and horizontal edges - var vEdges = edges.Where(e => e.orientation == "v").ToList(); - var hEdges = edges.Where(e => e.orientation == "h").ToList(); + var words = extractor.ExtractWords(charsList); + if (words.Count == 0) + return ""; - // Sort edges (vertical by X0 then Top, horizontal by Top then X0) - vEdges = vEdges.OrderBy(e => e.x0).ThenBy(e => e.top).ToList(); - hEdges = hEdges.OrderBy(e => e.top).ThenBy(e => e.x0).ToList(); + int rotation = words[0].ContainsKey("rotation") ? 
Convert.ToInt32(words[0]["rotation"]) : 0; - foreach (var v in vEdges) + if (rotation == 90) { - foreach (var h in hEdges) + words = words.OrderBy(w => Convert.ToSingle(w["x1"])).ThenByDescending(w => Convert.ToSingle(w["top"])).ToList(); + return string.Join(" ", words.Select(w => w["text"].ToString())); + } + else if (rotation == 270) + { + words = words.OrderByDescending(w => Convert.ToSingle(w["x1"])).ThenBy(w => Convert.ToSingle(w["top"])).ToList(); + return string.Join(" ", words.Select(w => w["text"].ToString())); + } + else + { + var lines = TableHelpers.ClusterObjects(words, w => Convert.ToSingle(w["doctop"]), yTolerance); + var result = string.Join("\n", lines.Select(line => string.Join(" ", line.Select(w => w["text"].ToString())))); + if (rotation == 180) { - // Check if the vertical and horizontal lines intersect within tolerance - if (v.top <= h.top + y_tolerance && v.bottom >= h.top - y_tolerance && - v.x0 >= h.x0 - x_tolerance && v.x0 <= h.x1 + x_tolerance) - { - var vertex = new Point(v.x0, h.top); - - if (!intersections.ContainsKey(vertex)) - { - intersections[vertex] = new Intersection(); - } - - intersections[vertex].VerticalEdges.Add(v); - intersections[vertex].HorizontalEdges.Add(h); - } + var charArray = result.ToCharArray(); + Array.Reverse(charArray); + return new string(charArray.Select(c => c == '\n' ? ' ' : c).ToArray()); } + return result; } - - return intersections; } - // Return the bounding box for an object. - static BBox obj_to_bbox(Edge edge) + internal static string CollateLine(List lineChars, float tolerance = TableFlags.TABLE_DEFAULT_X_TOLERANCE) { - return new BBox(edge.x0, edge.top, edge.x1, edge.bottom); + var coll = new StringBuilder(); + float? lastX1 = null; + foreach (var char_ in lineChars.OrderBy(c => c.x0)) + { + if (lastX1.HasValue && char_.x0 > (lastX1.Value + tolerance)) + coll.Append(" "); + lastX1 = char_.x1; + coll.Append(char_.text); + } + return coll.ToString(); } - - // Given a list of points(`intersections`), return all rectangular "cells" - // that those points describe. - // `intersections` should be a dictionary with (x0, top) tuples as keys, - // and a list of edge objects as values.The edge objects should correspond - // to the edges that touch the intersection. 
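+        // A minimal sketch of the gap rule that CollateLine applies when flattening one line
+        // of characters into a string: a space is emitted whenever the horizontal gap between
+        // consecutive characters exceeds the x tolerance. The tuple shape is illustrative;
+        // the real method works on CharDict objects.
+        static string CollateLineSketch(IEnumerable<(string text, float x0, float x1)> lineChars,
+                                        float tolerance = 3f)
+        {
+            var sb = new StringBuilder();
+            float? lastX1 = null;
+            foreach (var ch in lineChars.OrderBy(c => c.x0))
+            {
+                if (lastX1.HasValue && ch.x0 > lastX1.Value + tolerance)
+                    sb.Append(' ');                  // gap wider than tolerance => word break
+                lastX1 = ch.x1;
+                sb.Append(ch.text);
+            }
+            return sb.ToString();
+        }
+        // ("H",0,5) ("i",5,9) ("t",20,24) ("o",24,28) with tolerance 3 yields "Hi to".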
- public static List intersections_to_cells(Dictionary intersections) + internal static List DedupeChars(List chars, float tolerance = 1) { - var points = intersections.Keys.OrderBy(p => p.X).ToList(); - int nPoints = points.Count; - - bool edge_connects(Point p1, Point p2) - { - HashSet edges_to_set(List edges) - { - return new HashSet(edges.Select(obj_to_bbox)); - } + var key = new Func(c => new { c.fontname, c.size, c.upright, c.text }); + var posKey = new Func(c => new { c.doctop, c.x0 }); - if (p1.X == p2.X) - { - var key1 = new Point(-1,-1); - var key2 = new Point(-1,-1); - foreach (var ikey in intersections.Keys) - { - if (ikey.EqualTo(p1)) - { - key1 = ikey; - } - if (ikey.EqualTo(p2)) - { - key2 = ikey; - } - } - if (key1.X < 0 || key2.X < 0) - { - return false; - } - var common = edges_to_set(intersections[key1].VerticalEdges).Intersect(edges_to_set(intersections[key2].VerticalEdges)).ToList(); - if (common.Any()) return true; - } + var sortedChars = chars.OrderBy(key).ToList(); + var uniqueChars = new List(); - if (p1.Y == p2.Y) + foreach (var group in sortedChars.GroupBy(key)) + { + var yClusters = TableHelpers.ClusterObjects(group.ToList(), c => c.doctop, tolerance); + foreach (var yCluster in yClusters) { - var key1 = new Point(-1, -1); - var key2 = new Point(-1, -1); - foreach (var ikey in intersections.Keys) - { - if (ikey.EqualTo(p1)) - { - key1 = ikey; - } - if (ikey.EqualTo(p2)) - { - key2 = ikey; - } - } - if (key1.X < 0 || key2.X < 0) + var xClusters = TableHelpers.ClusterObjects(yCluster, c => c.x0, tolerance); + foreach (var xCluster in xClusters) { - return false; + uniqueChars.Add(xCluster.OrderBy(c => posKey(c)).First()); } - var common = edges_to_set(intersections[key1].HorizontalEdges).Intersect(edges_to_set(intersections[key2].HorizontalEdges)).ToList(); - if (common.Any()) return true; } - - return false; } - BBox find_smallest_cell(int i) - { - if (i == nPoints - 1) return null; + return uniqueChars.OrderBy(c => chars.IndexOf(c)).ToList(); + } + } - var pt = points[i]; - var rest = points.Skip(i + 1).ToList(); - // Get all the points directly below and directly right - var below = rest.Where(x => x.X == pt.X).ToList(); - var right = rest.Where(x => x.Y == pt.Y).ToList(); + // Edge processing functions + internal static class EdgeProcessing + { + // line_to_edge - Convert line to edge + internal static Edge LineToEdge(Dictionary line) + { + var edge = new Edge + { + x0 = Convert.ToSingle(line["x0"]), + x1 = Convert.ToSingle(line["x1"]), + top = Convert.ToSingle(line["top"]), + bottom = Convert.ToSingle(line["bottom"]), + width = line.ContainsKey("width") ? Convert.ToSingle(line["width"]) : 0, + height = line.ContainsKey("height") ? Convert.ToSingle(line["height"]) : 0, + orientation = Convert.ToSingle(line["top"]) == Convert.ToSingle(line["bottom"]) ? "h" : "v", + object_type = line.ContainsKey("object_type") ? line["object_type"].ToString() : "line", + doctop = line.ContainsKey("doctop") ? Convert.ToSingle(line["doctop"]) : 0, + page_number = line.ContainsKey("page_number") ? Convert.ToInt32(line["page_number"]) : 0, + y0 = line.ContainsKey("y0") ? Convert.ToSingle(line["y0"]) : 0, + y1 = line.ContainsKey("y1") ? 
Convert.ToSingle(line["y1"]) : 0 + }; + return edge; + } - foreach (var belowPt in below) - { - if (!edge_connects(pt, belowPt)) continue; + // rect_to_edges - Convert rectangle to 4 edges + internal static List RectToEdges(Dictionary rect) + { + var edges = new List(); + float x0 = Convert.ToSingle(rect["x0"]); + float top = Convert.ToSingle(rect["top"]); + float x1 = Convert.ToSingle(rect["x1"]); + float bottom = Convert.ToSingle(rect["bottom"]); + float width = x1 - x0; + float height = bottom - top; + float doctop = rect.ContainsKey("doctop") ? Convert.ToSingle(rect["doctop"]) : top; + + // Top edge + edges.Add(new Edge + { + x0 = x0, + x1 = x1, + top = bottom, + bottom = top, + width = width, + height = 0, + orientation = "h", + object_type = "rect_edge", + doctop = doctop, + y0 = bottom, + y1 = top + }); - foreach (var rightPt in right) - { - if (!edge_connects(pt, rightPt)) continue; - - Point bottomRight = new Point(rightPt.X, belowPt.Y); - - if (intersections.Keys.Any(p => p.EqualTo(rightPt)) - && edge_connects(bottomRight, rightPt) - && edge_connects(bottomRight, belowPt)) - { - return new BBox(pt.X, pt.Y, bottomRight.X, bottomRight.Y); - } - } - } + // Bottom edge + edges.Add(new Edge + { + x0 = x0, + x1 = x1, + top = top + height, + bottom = top + height, + width = width, + height = 0, + orientation = "h", + object_type = "rect_edge", + doctop = doctop + height, + y0 = top + height, + y1 = top + height + }); - return null; - } + // Left edge + edges.Add(new Edge + { + x0 = x0, + x1 = x0, + top = top, + bottom = bottom, + width = 0, + height = height, + orientation = "v", + object_type = "rect_edge", + doctop = doctop, + y0 = bottom, + y1 = top + }); - List bBoxes = new List(); - for (int i = 0; i < points.Count; i++) + // Right edge + edges.Add(new Edge { - BBox bbox = find_smallest_cell(i); - if (bbox != null) - bBoxes.Add(bbox); - } - return bBoxes; + x0 = x1, + x1 = x1, + top = top, + bottom = bottom, + width = 0, + height = height, + orientation = "v", + object_type = "rect_edge", + doctop = doctop, + y0 = bottom, + y1 = top + }); + + return edges; } - // Given a list of bounding boxes(`cells`), return a list of tables that - // hold those cells most simply(and contiguously). - public static List> cells_to_tables(Page page, List cells) + // curve_to_edges - Convert curve to edges + internal static List CurveToEdges(Dictionary curve) { - List bbox_to_corners(BBox bbox) + var edges = new List(); + var pts = curve["pts"] as List; + if (pts == null) return edges; + + float doctop = curve.ContainsKey("doctop") ? Convert.ToSingle(curve["doctop"]) : 0; + float top = curve.ContainsKey("top") ? 
Convert.ToSingle(curve["top"]) : 0; + + for (int i = 0; i < pts.Count - 1; i++) { - // Decompose the bounding box into its individual components - float x0 = bbox.x0; - float top = bbox.top; - float x1 = bbox.x1; - float bottom = bbox.bottom; + var p0Obj = pts[i] as List; + var p1Obj = pts[i + 1] as List; + if (p0Obj == null || p1Obj == null || p0Obj.Count < 2 || p1Obj.Count < 2) + continue; + + float p0x = Convert.ToSingle(p0Obj[0]); + float p0y = Convert.ToSingle(p0Obj[1]); + float p1x = Convert.ToSingle(p1Obj[0]); + float p1y = Convert.ToSingle(p1Obj[1]); + + string orientation = null; + if (p0x == p1x) + orientation = "v"; + else if (p0y == p1y) + orientation = "h"; + + if (orientation == null) continue; - // Return the four corners as a list of tuples - return new List + edges.Add(new Edge { - new Point(x0, top), - new Point(x0, bottom), - new Point(x1, top), - new Point(x1, bottom) - }; + x0 = Math.Min(p0x, p1x), + x1 = Math.Max(p0x, p1x), + top = Math.Min(p0y, p1y), + bottom = Math.Max(p0y, p1y), + width = Math.Abs(p0x - p1x), + height = Math.Abs(p0y - p1y), + orientation = orientation, + object_type = "curve_edge", + doctop = Math.Min(p0y, p1y) + (doctop - top), + y0 = Math.Max(p0y, p1y), + y1 = Math.Min(p0y, p1y) + }); } - List remainingCells = new List(cells); - List> tables = new List>(); + return edges; + } - // Iterate through the cells found above, and assign them - // to contiguous tables - HashSet currentCorners = new HashSet(); - List currentCells = new List(); + // obj_to_edges - Convert object to edges + internal static List ObjToEdges(Dictionary obj) + { + string objType = obj.ContainsKey("object_type") ? obj["object_type"].ToString() : ""; + + if (objType.Contains("_edge")) + return new List { LineToEdge(obj) }; + else if (objType == "line") + return new List { LineToEdge(obj) }; + else if (objType == "rect") + return RectToEdges(obj); + else if (objType == "curve") + return CurveToEdges(obj); + + return new List(); + } - while (remainingCells.Count > 0) + // filter_edges - Filter edges by orientation, type, and min length + internal static List FilterEdges( + List edges, + string orientation = null, + string edgeType = null, + float minLength = 1) + { + if (orientation != null && orientation != "v" && orientation != "h") + throw new ArgumentException("Orientation must be 'v' or 'h'"); + + return edges.Where(e => { - int initialCellCount = currentCells.Count; + string dim = e.orientation == "v" ? "height" : "width"; + float dimValue = e.orientation == "v" ? e.height : e.width; + bool etCorrect = edgeType == null || e.object_type == edgeType; + bool orientCorrect = orientation == null || e.orientation == orientation; + return etCorrect && orientCorrect && dimValue >= minLength; + }).ToList(); + } - foreach (var cell in new List(remainingCells)) + // snap_objects - Snap objects to their average position + internal static List> SnapObjects( + IEnumerable> objs, + string attr, + float tolerance) + { + string axis = attr == "x0" || attr == "x1" ? "h" : "v"; + var objsList = objs.ToList(); + var clusters = TableHelpers.ClusterObjects(objsList, obj => Convert.ToSingle(obj[attr]), tolerance); + var avgs = clusters.Select(cluster => cluster.Average(obj => Convert.ToSingle(obj[attr]))).ToList(); + + var snappedClusters = new List>>(); + for (int i = 0; i < clusters.Count; i++) + { + float avg = avgs[i]; + var snapped = clusters[i].Select(obj => { - List cellCorners = bbox_to_corners(cell); - // If we're just starting a table ... 
- if (currentCells.Count == 0) + var newObj = new Dictionary(obj); + float oldValue = Convert.ToSingle(obj[attr]); + float diff = avg - oldValue; + + if (axis == "h") { - // ... immediately assign it to the empty group - currentCorners.UnionWith(cellCorners); - currentCells.Add(cell); - remainingCells.Remove(cell); + newObj["x0"] = Convert.ToSingle(obj["x0"]) + diff; + newObj["x1"] = Convert.ToSingle(obj["x1"]) + diff; } else { - // How many corners does this table share with the current group? - int cornerCount = cellCorners.Count(corner => currentCorners.Any(cc => cc.EqualTo(corner))); - - // If touching on at least one corner... - if (cornerCount > 0) - { - // ... assign it to the current group - currentCorners.UnionWith(cellCorners); - currentCells.Add(cell); - remainingCells.Remove(cell); - } + newObj["top"] = Convert.ToSingle(obj["top"]) + diff; + newObj["bottom"] = Convert.ToSingle(obj["bottom"]) + diff; + if (obj.ContainsKey("doctop")) + newObj["doctop"] = Convert.ToSingle(obj["doctop"]) + diff; + if (obj.ContainsKey("y0")) + newObj["y0"] = Convert.ToSingle(obj["y0"]) - diff; + if (obj.ContainsKey("y1")) + newObj["y1"] = Convert.ToSingle(obj["y1"]) - diff; } - } - // If this iteration did not find any more cells to append... - if (currentCells.Count == initialCellCount) - { - tables.Add(new List(currentCells)); - currentCorners.Clear(); - currentCells.Clear(); - } + return newObj; + }).ToList(); + snappedClusters.Add(snapped); } + + return snappedClusters.SelectMany(x => x).ToList(); + } - // Once we have exhausting the list of cells ... - // ... and we have a cell group that has not been stored - if (currentCells.Count > 0) + // snap_edges - Snap edges within tolerance + internal static List SnapEdges( + List edges, + float xTolerance = TableFlags.TABLE_DEFAULT_SNAP_TOLERANCE, + float yTolerance = TableFlags.TABLE_DEFAULT_SNAP_TOLERANCE) + { + var byOrientation = new Dictionary> { - tables.Add(new List(currentCells)); - } + { "v", new List() }, + { "h", new List() } + }; - // remove tables without text or having only 1 column - for (int i = tables.Count - 1; i >= 0; i--) - { - var r = new BBox(0, 0, 0, 0); // EMPTY_RECT placeholder - var x1Vals = new HashSet(); - var x0Vals = new HashSet(); + foreach (var e in edges) + byOrientation[e.orientation].Add(e); - foreach (var cell in tables[i]) - { - r |= cell; - x1Vals.Add(cell.x1); - x0Vals.Add(cell.x0); - } + var snappedV = SnapEdgesByOrientation(byOrientation["v"], "x0", xTolerance); + var snappedH = SnapEdgesByOrientation(byOrientation["h"], "top", yTolerance); + + return snappedV.Concat(snappedH).ToList(); + } + + private static List SnapEdgesByOrientation(List edges, string attr, float tolerance) + { + if (edges.Count == 0) return edges; + + var clusters = TableHelpers.ClusterObjects(edges, e => GetEdgeValue(e, attr), tolerance); + var avgs = clusters.Select(cluster => cluster.Average(e => GetEdgeValue(e, attr))).ToList(); - string rText = page.GetTextbox(new Rect(r.x0, r.top, r.x1, r.bottom)); - if (x1Vals.Count < 2 || x0Vals.Count < 2 || whiteSpaces_issuperset(rText)) + var result = new List(); + for (int i = 0; i < clusters.Count; i++) + { + float avg = avgs[i]; + foreach (var e in clusters[i]) { - tables.RemoveAt(i); + var snapped = new Edge + { + x0 = e.x0, + x1 = e.x1, + top = e.top, + bottom = e.bottom, + width = e.width, + height = e.height, + orientation = e.orientation, + object_type = e.object_type, + doctop = e.doctop, + page_number = e.page_number, + y0 = e.y0, + y1 = e.y1 + }; + + float diff = avg - GetEdgeValue(e, 
attr); + if (attr == "x0") + { + snapped.x0 = avg; + snapped.x1 = e.x1 + diff; + snapped.width = snapped.x1 - snapped.x0; + } + else if (attr == "top") + { + snapped.top = avg; + snapped.bottom = e.bottom + diff; + snapped.height = snapped.bottom - snapped.top; + snapped.doctop = e.doctop + diff; + } + + result.Add(snapped); } } - // Sort the tables top-to-bottom-left-to-right based on the value of the - // topmost-and-then-leftmost coordinate of a table. - var sortedTables = tables.OrderBy(t => t.Min(c => c.top)) - .ThenBy(t => t.Min(c => c.x0)) - .ToList(); - - return sortedTables; - } - - public static List extract_words(List chars, Dictionary kwargs) - { - // WordExtractor parameters - float x_tolerance = TableFlags.TABLE_DEFAULT_X_TOLERANCE; - float y_tolerance = TableFlags.TABLE_DEFAULT_Y_TOLERANCE; - bool keep_blank_chars = false; - bool use_text_flow = false; - bool horizontal_ltr = true; - bool vertical_ttb = false; - List extra_attrs = null; - bool split_at_punctuation = false; - bool expand_ligatures = true; - - foreach (string key in kwargs.Keys) - { - switch (key) - { - case "x_tolerance": - x_tolerance = float.Parse(kwargs[key].ToString(), System.Globalization.CultureInfo.InvariantCulture); break; - case "y_tolerance": - y_tolerance = float.Parse(kwargs[key].ToString(), System.Globalization.CultureInfo.InvariantCulture); break; - case "keep_blank_chars": - keep_blank_chars = bool.Parse(kwargs[key].ToString()); break; - case "use_text_flow": - use_text_flow = bool.Parse(kwargs[key].ToString()); break; - case "horizontal_ltr": - horizontal_ltr = bool.Parse(kwargs[key].ToString()); break; - case "vertical_ttb": - vertical_ttb = bool.Parse(kwargs[key].ToString()); break; - case "extra_attrs": - extra_attrs = (List)kwargs[key]; break; - case "split_at_punctuation": - split_at_punctuation = bool.Parse(kwargs[key].ToString()); break; - case "expand_ligatures": - expand_ligatures = bool.Parse(kwargs[key].ToString()); break; - default: - break; - } - } - - WordExtractor extractor = new WordExtractor( - x_tolerance, - y_tolerance, - keep_blank_chars, - use_text_flow, - horizontal_ltr, - vertical_ttb, - extra_attrs, - split_at_punctuation, - expand_ligatures - ); - - return extractor.extract_words(chars); + return result; } - public static TextMap chars_to_textmap(List chars, Dictionary kwargs) + private static float GetEdgeValue(Edge e, string attr) { - // Add the presorted parameter - kwargs["presorted"] = true; + switch (attr) + { + case "x0": + return e.x0; + case "top": + return e.top; + default: + return 0; + } + } - // WordExtractor parameters - float x_tolerance = TableFlags.TABLE_DEFAULT_X_TOLERANCE; - float y_tolerance = TableFlags.TABLE_DEFAULT_Y_TOLERANCE; - bool keep_blank_chars = false; - bool use_text_flow = false; - bool horizontal_ltr = true; - bool vertical_ttb = false; - List extra_attrs = null; - bool split_at_punctuation = false; - bool expand_ligatures = true; - - // WordMap parameters - bool layout = false; - float layout_width = 0f; - float layout_height = 0f; - int layout_width_chars = 0; - int layout_height_chars = 0; - float x_density = TableFlags.TABLE_DEFAULT_X_DENSITY; - float y_density = TableFlags.TABLE_DEFAULT_Y_DENSITY; - float x_shift = 0; - float y_shift = 0; - bool presorted = false; - - foreach (string key in kwargs.Keys) - { - switch (key) - { - case "x_tolerance": - x_tolerance = (float)kwargs[key]; break; - case "y_tolerance": - y_tolerance = (float)kwargs[key]; break; - case "keep_blank_chars": - keep_blank_chars = (bool)kwargs[key]; break; - 
case "use_text_flow": - use_text_flow = (bool)kwargs[key]; break; - case "horizontal_ltr": - horizontal_ltr = (bool)kwargs[key]; break; - case "vertical_ttb": - vertical_ttb = (bool)kwargs[key]; break; - case "extra_attrs": - extra_attrs = (List)kwargs[key]; break; - case "split_at_punctuation": - split_at_punctuation = (bool)kwargs[key]; break; - case "expand_ligatures": - expand_ligatures = (bool)kwargs[key]; break; - case "layout": - layout = (bool)kwargs[key]; break; - case "layout_width": - layout_width = (float)kwargs[key]; break; - case "layout_height": - layout_height = (float)kwargs[key]; break; - case "layout_width_chars": - layout_width_chars = (int)kwargs[key]; break; - case "layout_height_chars": - layout_height_chars = (int)kwargs[key]; break; - case "x_density": - x_density = (float)kwargs[key]; break; - case "y_density": - y_density = (float)kwargs[key]; break; - case "x_shift": - x_shift = (float)kwargs[key]; break; - case "y_shift": - y_shift = (float)kwargs[key]; break; - case "presorted": - presorted = (bool)kwargs[key]; break; - default: - break; - } - } - - WordExtractor extractor = new WordExtractor( - x_tolerance, - y_tolerance, - keep_blank_chars, - use_text_flow, - horizontal_ltr, - vertical_ttb, - extra_attrs, - split_at_punctuation, - expand_ligatures - ); + // resize_object - Resize an object by changing a key value + internal static Dictionary ResizeObject(Dictionary obj, string key, float value) + { + if (key != "x0" && key != "x1" && key != "top" && key != "bottom") + throw new ArgumentException("Key must be 'x0', 'x1', 'top', or 'bottom'"); - WordMap wordmap = extractor.extract_wordmap(chars); - - TextMap textmap = wordmap.to_textmap( - layout, - layout_width, - layout_height, - layout_width_chars, - layout_height_chars, - x_density, - y_density, - x_shift, - y_shift, - y_tolerance, - use_text_flow, - presorted, - expand_ligatures - ); + var newObj = new Dictionary(obj); + float oldValue = Convert.ToSingle(obj[key]); + float diff = value - oldValue; + + newObj[key] = value; - return textmap; - } - - public static string extract_text(List chars, Dictionary kwargs) - { - // WordExtractor parameters - float x_tolerance = TableFlags.TABLE_DEFAULT_X_TOLERANCE; - float y_tolerance = TableFlags.TABLE_DEFAULT_Y_TOLERANCE; - bool keep_blank_chars = false; - bool use_text_flow = false; - bool horizontal_ltr = true; - bool vertical_ttb = false; - List extra_attrs = null; - bool split_at_punctuation = false; - bool expand_ligatures = true; - - // WordMap parameters - bool layout = false; - float layout_width = 0f; - float layout_height = 0f; - int layout_width_chars = 0; - int layout_height_chars = 0; - float x_density = TableFlags.TABLE_DEFAULT_X_DENSITY; - float y_density = TableFlags.TABLE_DEFAULT_Y_DENSITY; - float x_shift = 0; - float y_shift = 0; - bool presorted = false; - - foreach (string key in kwargs.Keys) - { - switch (key) - { - case "x_tolerance": - x_tolerance = (float)kwargs[key]; break; - case "y_tolerance": - y_tolerance = (float)kwargs[key]; break; - case "keep_blank_chars": - keep_blank_chars = (bool)kwargs[key]; break; - case "use_text_flow": - use_text_flow = (bool)kwargs[key]; break; - case "horizontal_ltr": - horizontal_ltr = (bool)kwargs[key]; break; - case "vertical_ttb": - vertical_ttb = (bool)kwargs[key]; break; - case "extra_attrs": - extra_attrs = (List)kwargs[key]; break; - case "split_at_punctuation": - split_at_punctuation = (bool)kwargs[key]; break; - case "expand_ligatures": - expand_ligatures = (bool)kwargs[key]; break; - case 
"layout": - layout = (bool)kwargs[key]; break; - case "layout_width": - layout_width = (float)kwargs[key]; break; - case "layout_height": - layout_height = (float)kwargs[key]; break; - case "layout_width_chars": - layout_width_chars = (int)kwargs[key]; break; - case "layout_height_chars": - layout_height_chars = (int)kwargs[key]; break; - case "x_density": - x_density = (float)kwargs[key]; break; - case "y_density": - y_density = (float)kwargs[key]; break; - case "x_shift": - x_shift = (float)kwargs[key]; break; - case "y_shift": - y_shift = (float)kwargs[key]; break; - case "presorted": - presorted = (bool)kwargs[key]; break; - default: - break; - } - } - - if (chars.Count == 0) + if (key == "x0") { - return ""; + if (value > Convert.ToSingle(obj["x1"])) + throw new ArgumentException("x0 must be <= x1"); + newObj["width"] = Convert.ToSingle(obj["x1"]) - value; } + else if (key == "x1") + { + if (value < Convert.ToSingle(obj["x0"])) + throw new ArgumentException("x1 must be >= x0"); + newObj["width"] = value - Convert.ToSingle(obj["x0"]); + } + else if (key == "top") + { + if (value > Convert.ToSingle(obj["bottom"])) + throw new ArgumentException("top must be <= bottom"); + newObj["doctop"] = Convert.ToSingle(obj["doctop"]) + diff; + newObj["height"] = Convert.ToSingle(obj["height"]) - diff; + if (obj.ContainsKey("y1")) + newObj["y1"] = Convert.ToSingle(obj["y1"]) - diff; + } + else if (key == "bottom") + { + if (value < Convert.ToSingle(obj["top"])) + throw new ArgumentException("bottom must be >= top"); + newObj["height"] = Convert.ToSingle(obj["height"]) + diff; + if (obj.ContainsKey("y0")) + newObj["y0"] = Convert.ToSingle(obj["y0"]) - diff; + } + + return newObj; + } - // Layout handling - if (layout == true) + // join_edge_group - Join edges along the same line + internal static List JoinEdgeGroup_( + List edges, + string orientation, + float tolerance = TableFlags.TABLE_DEFAULT_JOIN_TOLERANCE) + { + string minProp, maxProp; + if (orientation == "h") + { + minProp = "x0"; + maxProp = "x1"; + } + else if (orientation == "v") { - return chars_to_textmap(chars, kwargs).AsString; + minProp = "top"; + maxProp = "bottom"; } else { - WordExtractor extractor = new WordExtractor( - x_tolerance, - y_tolerance, - keep_blank_chars, - use_text_flow, - horizontal_ltr, - vertical_ttb, - extra_attrs, - split_at_punctuation, - expand_ligatures - ); + throw new ArgumentException("Orientation must be 'v' or 'h'"); + } - // Extract words using WordExtractor - List words = extractor.extract_words(chars); - // rotation cannot change within a cell - int rotation = words.Count > 0 ? 
(int)words[0].rotation : 0; + var sortedEdges = edges.OrderBy(e => GetEdgeValue(e, minProp)).ToList(); + if (sortedEdges.Count == 0) return new List(); - string lines; + var joined = new List { sortedEdges[0] }; - if (rotation == 90) - { - // Sort for rotation 90 - words = words.OrderBy(w => w.x1).ThenByDescending(w => w.top).ToList(); - lines = string.Join(" ", words.Select(w => w.text.ToString())); - } - else if (rotation == 270) + for (int i = 1; i < sortedEdges.Count; i++) + { + var e = sortedEdges[i]; + var last = joined[joined.Count - 1]; + + float eMin = GetEdgeValue(e, minProp); + float lastMax = GetEdgeValue(last, maxProp); + + if (eMin <= (lastMax + tolerance)) { - // Sort for rotation 270 - words = words.OrderByDescending(w => w.x1).ThenBy(w => w.top).ToList(); - lines = string.Join(" ", words.Select(w => w.text.ToString())); + float eMax = GetEdgeValue(e, maxProp); + if (eMax > lastMax) + { + // Extend current edge + var extended = new Edge + { + x0 = last.x0, + x1 = last.x1, + top = last.top, + bottom = last.bottom, + width = last.width, + height = last.height, + orientation = last.orientation, + object_type = last.object_type, + doctop = last.doctop, + page_number = last.page_number, + y0 = last.y0, + y1 = last.y1 + }; + + if (orientation == "h") + { + extended.x1 = e.x1; + extended.width = extended.x1 - extended.x0; + } + else + { + extended.bottom = e.bottom; + extended.height = extended.bottom - extended.top; + } + + joined[joined.Count - 1] = extended; + } } else { - // Cluster words based on doctop - var linesGrouped = cluster_objects(words, obj=>obj.doctop, y_tolerance); - lines = string.Join("\n", linesGrouped.Select(line => string.Join(" ", line.Select(w => w.text)))); + joined.Add(e); + } + } + + return joined; + } + + internal static List JoinEdgeGroup( + List edges, + string orientation, + float tolerance) + { + Func minProp; + Func maxProp; + Action setMaxProp; + + // Select properties based on orientation + if (orientation == "h") + { + minProp = e => e.x0; + maxProp = e => e.x1; + setMaxProp = (e, v) => e.x1 = v; + } + else if (orientation == "v") + { + minProp = e => e.top; + maxProp = e => e.bottom; + setMaxProp = (e, v) => e.bottom = v; + } + else + { + throw new ArgumentException("Orientation must be 'h' or 'v'"); + } - if (rotation == 180) + if (edges == null || edges.Count == 0) + return new List(); + + // Sort edges by their minimum extent + var sortedEdges = edges + .OrderBy(minProp) + .ToList(); + + var joined = new List { sortedEdges[0] }; + + // Merge overlapping / nearby edges + for (int i = 1; i < sortedEdges.Count; i++) + { + var current = sortedEdges[i]; + var last = joined[joined.Count - 1]; + + if (minProp(current) <= maxProp(last) + tolerance) + { + // Extend the last edge if needed + if (maxProp(current) > maxProp(last)) { - // Special handling for rotation 180 (reverse lines and replace newline with spaces) - lines = new string(lines.Reverse().Select(c => c == '\n' ? 
' ' : c).ToArray()); + setMaxProp(last, maxProp(current)); } } - - return lines; + else + { + // Separate edge → start a new segment + joined.Add(current); + } } - } - } - public class TextItem - { - public string Text { get; set; } - public object Obj { get; set; } + return joined; + } - public TextItem(string text, object obj) + // merge_edges - Merge edges using snap and join + internal static List MergeEdges_( + List edges, + float snapXTolerance, + float snapYTolerance, + float joinXTolerance, + float joinYTolerance) { - Text = text; - Obj = obj; - } - } + if (snapXTolerance > 0 || snapYTolerance > 0) + edges = SnapEdges(edges, snapXTolerance, snapYTolerance); - public class TextMap - { - public List Tuples { get; set; } - public string AsString { get; set; } + // Use Tuple for grouping key (matching Python's get_group function) + var sorted = edges.OrderBy(e => Tuple.Create(e.orientation, e.orientation == "h" ? e.top : e.x0)).ToList(); + var edgeGroups = sorted.GroupBy(e => Tuple.Create(e.orientation, e.orientation == "h" ? e.top : e.x0)); - public TextMap(List tuples = null) + var merged = new List(); + foreach (var group in edgeGroups) + { + string orientation = group.Key.Item1; // First element of tuple is orientation + float tolerance = orientation == "h" ? joinXTolerance : joinYTolerance; + merged.AddRange(JoinEdgeGroup(group.ToList(), orientation, tolerance)); + } + + return merged; + } + + public static List MergeEdges( + List edges, + float snapXTolerance, + float snapYTolerance, + float joinXTolerance, + float joinYTolerance) { - Tuples = tuples ?? new List(); - AsString = string.Join("", Tuples.Select(item => item.Text)); + // Local grouping key (equivalent to Python get_group) + (string, float) GetGroupKey(Edge edge) + { + return edge.orientation == "h" + ? ("h", edge.top) + : ("v", edge.x0); + } + + // Optional snapping + if (snapXTolerance > 0 || snapYTolerance > 0) + { + edges = SnapEdges(edges, snapXTolerance, snapYTolerance); + } + + // Sort by group key + var sortedEdges = edges + .OrderBy(e => GetGroupKey(e).Item1) + .ThenBy(e => GetGroupKey(e).Item2) + .ToList(); + + // Group edges + var groupedEdges = sortedEdges + .GroupBy(GetGroupKey); + + // Join edge groups + var mergedEdges = new List(); + + foreach (var group in groupedEdges) + { + string orientation = group.Key.Item1; + float joinTolerance = + orientation == "h" ? 
joinXTolerance : joinYTolerance; + + var joined = JoinEdgeGroup( + group.ToList(), + orientation, + joinTolerance + ); + + mergedEdges.AddRange(joined); + } + + return mergedEdges; } - } - public class WordMap - { - public List>> Tuples { get; set; } + // bbox_to_rect - Convert bbox tuple to rect dict + internal static Dictionary BboxToRect(Tuple bbox) + { + return new Dictionary + { + { "x0", bbox.Item1 }, + { "top", bbox.Item2 }, + { "x1", bbox.Item3 }, + { "bottom", bbox.Item4 } + }; + } - public WordMap(List>> tuples) + // objects_to_rect - Get smallest rect containing objects + internal static Dictionary ObjectsToRect(IEnumerable objects) { - Tuples = tuples; + var bbox = TableHelpers.ObjectsToBbox(objects); + return BboxToRect(Tuple.Create(bbox.X0, bbox.Y0, bbox.X1, bbox.Y1)); } - public TextMap to_textmap( - bool layout = false, - float layoutWidth = 0, - float layoutHeight = 0, - int layoutWidthChars = 0, - int layoutHeightChars = 0, - float xDensity = TableFlags.TABLE_DEFAULT_X_DENSITY, - float yDensity = TableFlags.TABLE_DEFAULT_Y_DENSITY, - float xShift = 0, - float yShift = 0, - float yTolerance = TableFlags.TABLE_DEFAULT_Y_TOLERANCE, - bool useTextFlow = false, - bool presorted = false, - bool expandLigatures = true - ) + // merge_bboxes - Merge multiple bboxes + internal static Tuple MergeBboxes(IEnumerable> bboxes) { - var textMap = new List(); + var bboxList = bboxes.ToList(); + if (bboxList.Count == 0) + return Tuple.Create(0f, 0f, 0f, 0f); - if (Tuples.Count == 0) - return new TextMap(textMap); + return Tuple.Create( + bboxList.Min(b => b.Item1), + bboxList.Min(b => b.Item2), + bboxList.Max(b => b.Item3), + bboxList.Max(b => b.Item4) + ); + } - var expansions = expandLigatures ? TableFlags.TABLE_LIGATURES : new Dictionary(); + // words_to_edges_h - Find horizontal edges from words + internal static List WordsToEdgesH( + List> words, + int wordThreshold = (int)TableFlags.TABLE_DEFAULT_MIN_WORDS_HORIZONTAL) + { + var byTop = TableHelpers.ClusterObjects(words, w => Convert.ToSingle(w["top"]), 1); + var largeClusters = byTop.Where(x => x.Count >= wordThreshold).ToList(); + + if (largeClusters.Count == 0) + return new List(); - // Layout handling - if (layout) + var rects = largeClusters.Select(cluster => ObjectsToRect(cluster.Cast())).ToList(); + float minX0 = rects.Min(r => Convert.ToSingle(r["x0"])); + float maxX1 = rects.Max(r => Convert.ToSingle(r["x1"])); + + var edges = new List(); + foreach (var r in rects) { - if (layoutWidthChars > 0) - { - if (layoutWidth > 0) - { - throw new ArgumentException("`layoutWidth` and `layoutWidthChars` cannot both be set."); - } - } - else - { - layoutWidthChars = (int)Math.Round(layoutWidth / xDensity); - } + float top = Convert.ToSingle(r["top"]); + float bottom = Convert.ToSingle(r["bottom"]); - if (layoutHeightChars > 0) + // Top edge + edges.Add(new Edge { - if (layoutHeight > 0) - { - throw new ArgumentException("`layoutHeight` and `layoutHeightChars` cannot both be set."); - } - } - else + x0 = minX0, + x1 = maxX1, + top = top, + bottom = top, + width = maxX1 - minX0, + height = 0, + orientation = "h", + object_type = "text_edge" + }); + + // Bottom edge + edges.Add(new Edge { - layoutHeightChars = (int)Math.Round(layoutHeight / yDensity); - } + x0 = minX0, + x1 = maxX1, + top = bottom, + bottom = bottom, + width = maxX1 - minX0, + height = 0, + orientation = "h", + object_type = "text_edge" + }); } - int numNewlines = 0; - var wordsSortedDoctop = presorted || useTextFlow - ? 
Tuples - : Tuples.OrderBy(t => t.Item1.doctop).ToList(); + return edges; + } - Character firstWord = wordsSortedDoctop[0].Item1; - float doctopStart = firstWord.doctop - firstWord.top; + // get_bbox_overlap - Get overlap between two bboxes + internal static Tuple GetBboxOverlap( + Tuple a, + Tuple b) + { + float oLeft = Math.Max(a.Item1, b.Item1); + float oRight = Math.Min(a.Item3, b.Item3); + float oBottom = Math.Min(a.Item4, b.Item4); + float oTop = Math.Max(a.Item2, b.Item2); + float oWidth = oRight - oLeft; + float oHeight = oBottom - oTop; + + if (oHeight >= 0 && oWidth >= 0 && oHeight + oWidth > 0) + return Tuple.Create(oLeft, oTop, oRight, oBottom); + + return null; + } + + // words_to_edges_v - Find vertical edges from words + internal static List WordsToEdgesV( + List> words, + int wordThreshold = (int)TableFlags.TABLE_DEFAULT_MIN_WORDS_VERTICAL) + { + var byX0 = TableHelpers.ClusterObjects(words, w => Convert.ToSingle(w["x0"]), 1); + var byX1 = TableHelpers.ClusterObjects(words, w => Convert.ToSingle(w["x1"]), 1); + + Func, float> getCenter = w => + (Convert.ToSingle(w["x0"]) + Convert.ToSingle(w["x1"])) / 2; + var byCenter = TableHelpers.ClusterObjects(words, getCenter, 1); + + var clusters = byX0.Concat(byX1).Concat(byCenter).ToList(); + var sortedClusters = clusters.OrderByDescending(x => x.Count).ToList(); + var largeClusters = sortedClusters.Where(x => x.Count >= wordThreshold).ToList(); - int k = 0; - foreach (var ws in cluster_objects(wordsSortedDoctop, t => t.Item1.doctop, yTolerance)) + if (largeClusters.Count == 0) + return new List(); + + var bboxes = largeClusters.Select(cluster => { - float yDist = layout - ? (ws[0].Item1.doctop - (doctopStart + yShift)) / yDensity - : 0; + var rect = ObjectsToRect(cluster.Cast()); + return Tuple.Create( + Convert.ToSingle(rect["x0"]), + Convert.ToSingle(rect["top"]), + Convert.ToSingle(rect["x1"]), + Convert.ToSingle(rect["bottom"]) + ); + }).ToList(); - int numNewlinesPrepend = Math.Max(k > 0 ? 1 : 0, (int)Math.Round(yDist) - numNewlines); - k++; - for (int i = 0; i < numNewlinesPrepend; i++) - { - if (textMap.Count == 0 || textMap.Last().Text == "\n") - { - textMap.Add(new TextItem(" ", null)); // Blank line handling - } - textMap.Add(new TextItem("\n", null)); // Add newline - } - numNewlines += numNewlinesPrepend; + var condensedBboxes = new List>(); + foreach (var bbox in bboxes) + { + bool hasOverlap = condensedBboxes.Any(c => GetBboxOverlap(bbox, c) != null); + if (!hasOverlap) + condensedBboxes.Add(bbox); + } - float lineLen = 0; + if (condensedBboxes.Count == 0) + return new List(); - var lineWordsSortedX0 = presorted || useTextFlow - ? ws - : ws.OrderBy(w => w.Item1.x0).ToList(); + var condensedRects = condensedBboxes.Select(bbox => BboxToRect(bbox)) + .OrderBy(r => Convert.ToSingle(r["x0"])).ToList(); + + float maxX1 = condensedRects.Max(r => Convert.ToSingle(r["x1"])); + float minTop = condensedRects.Min(r => Convert.ToSingle(r["top"])); + float maxBottom = condensedRects.Max(r => Convert.ToSingle(r["bottom"])); - foreach (var word in lineWordsSortedX0) + var edges = new List(); + foreach (var r in condensedRects) + { + edges.Add(new Edge { - var wordObj = word.Item1; - float xDist = layout ? 
(wordObj.x0 - xShift) / xDensity : 0; - int numSpacesPrepend = Math.Max(Math.Min(1, (int)lineLen), (int)Math.Round(xDist) - (int)lineLen); - for (int i = 0; i < numSpacesPrepend; i++) - { - textMap.Add(new TextItem(" ", null)); // Add spaces before the word - } - lineLen += numSpacesPrepend; + x0 = Convert.ToSingle(r["x0"]), + x1 = Convert.ToSingle(r["x0"]), + top = minTop, + bottom = maxBottom, + width = 0, + height = maxBottom - minTop, + orientation = "v", + object_type = "text_edge" + }); + } - foreach (Character c in word.Item2) - { - string letters = expansions.ContainsKey(c.text) ? expansions[c.text] : c.text; - foreach (var letter in letters) - { - textMap.Add(new TextItem(letter.ToString(), c)); // Add each letter - lineLen += 1; - } - } - } + // Add rightmost edge + edges.Add(new Edge + { + x0 = maxX1, + x1 = maxX1, + top = minTop, + bottom = maxBottom, + width = 0, + height = maxBottom - minTop, + orientation = "v", + object_type = "text_edge" + }); - // Add spaces at the end of the line if layout - if (layout) + return edges; + } + + // edges_to_intersections - Find intersection points of edges + internal static Dictionary, Dictionary>> EdgesToIntersections( + List edges, + float xTolerance = 1, + float yTolerance = 1) + { + var intersections = new Dictionary, Dictionary>>(); + var vEdges = edges.Where(e => e.orientation == "v") + .OrderBy(e => e.x0).ThenBy(e => e.top).ToList(); + var hEdges = edges.Where(e => e.orientation == "h") + .OrderBy(e => e.top).ThenBy(e => e.x0).ToList(); + + foreach (var v in vEdges) + { + foreach (var h in hEdges) { - for (int i = 0; i < (layoutWidthChars - (int)lineLen); i++) + if ((v.top <= (h.top + yTolerance)) && + (v.bottom >= (h.top - yTolerance)) && + (v.x0 >= (h.x0 - xTolerance)) && + (v.x0 <= (h.x1 + xTolerance))) { - textMap.Add(new TextItem(" ", null)); + var vertex = Tuple.Create(v.x0, h.top); + if (!intersections.ContainsKey(vertex)) + { + intersections[vertex] = new Dictionary> + { + { "v", new List() }, + { "h", new List() } + }; + } + intersections[vertex]["v"].Add(v); + intersections[vertex]["h"].Add(h); } } } - // Append blank lines at the end of text - if (layout) + return intersections; + } + + // intersections_to_cells - Convert intersections to cells + internal static List IntersectionsToCells_( + Dictionary, Dictionary>> intersections) + { + var cells = new List(); + var points = intersections.Keys.OrderBy(p => p.Item2).ThenBy(p => p.Item1).ToList(); + int nPoints = points.Count; + + Func, Tuple, bool> edgeConnects = (p1, p2) => { - int numNewlinesAppend = layoutHeightChars - (numNewlines + 1); - for (int i = 0; i < numNewlinesAppend; i++) + Func, HashSet>> edgesToSet = edges => + { + return new HashSet>(edges.Select(e => + Tuple.Create(e.x0, e.top, e.x1, e.bottom))); + }; + + if (p1.Item1 == p2.Item1) // Same x { - if (i > 0) - { - textMap.Add(new TextItem(" ", null)); // Blank line at the end - } - textMap.Add(new TextItem("\n", null)); // Add newline + var common = new HashSet>(edgesToSet(intersections[p1]["v"])); + common.IntersectWith(edgesToSet(intersections[p2]["v"])); + if (common.Count > 0) + return true; } - // Remove the last newline if present - if (textMap.Last().Text == "\n") + if (p1.Item2 == p2.Item2) // Same y { - textMap.RemoveAt(textMap.Count - 1); + var common = new HashSet>(edgesToSet(intersections[p1]["h"])); + common.IntersectWith(edgesToSet(intersections[p2]["h"])); + if (common.Count > 0) + return true; } - } - - return new TextMap(textMap); - } - } - public class WordExtractor - { - public float 
xTolerance; - public float yTolerance; - public bool keepBlankChars; - public bool useTextFlow; - public bool horizontalLtr; // Should words be read left-to-right? - public bool verticalTtb; // Should vertical words be read top-to-bottom? - public List extraAttrs; - public string splitAtPunctuation; - public Dictionary expansions; + return false; + }; - public WordExtractor( - float xTolerance = TableFlags.TABLE_DEFAULT_X_TOLERANCE, - float yTolerance = TableFlags.TABLE_DEFAULT_Y_TOLERANCE, - bool keepBlankChars = false, - bool useTextFlow = false, - bool horizontalLtr = true, - bool verticalTtb = false, - List extraAttrs = null, - bool splitAtPunctuation = false, - bool expandLigatures = true - ) - { - this.xTolerance = xTolerance; - this.yTolerance = yTolerance; - this.keepBlankChars = keepBlankChars; - this.useTextFlow = useTextFlow; - this.horizontalLtr = horizontalLtr; - this.verticalTtb = verticalTtb; - this.extraAttrs = extraAttrs ?? new List(); - this.splitAtPunctuation = splitAtPunctuation ? string.Join("", new[] { '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~' }) : ""; - this.expansions = expandLigatures ? TableFlags.TABLE_LIGATURES : new Dictionary(); - } + for (int i = 0; i < nPoints - 1; i++) + { + var pt = points[i]; + var rest = points.Skip(i + 1).ToList(); - public Character merge_chars(List orderedChars) - { - float x0, top, x1, bottom; - BBox bbox = objects_to_bbox(orderedChars); - x0 = bbox.x0; top = bbox.top; x1 = bbox.x1; bottom = bbox.bottom; - float doctopAdj = orderedChars[0].doctop - orderedChars[0].top; - bool upright = orderedChars[0].upright; - int direction = (this.horizontalLtr ? 1 : -1) * (upright ? 1 : -1); + var below = rest.Where(x => x.Item1 == pt.Item1).ToList(); + var right = rest.Where(x => x.Item2 == pt.Item2).ToList(); - Matrix matrix = orderedChars[0].matrix; + foreach (var belowPt in below) + { + if (!edgeConnects(pt, belowPt)) + continue; - int rotation = 0; - if (!upright && matrix[1] < 0) - { - orderedChars.Reverse(); - rotation = 270; - } - else if (matrix[0] < 0 && matrix[3] < 0) - { - rotation = 180; - } - else if (matrix[1] > 0) - { - rotation = 90; - } + foreach (var rightPt in right) + { + if (!edgeConnects(pt, rightPt)) + continue; - var word = new Character { - text = string.Join("", orderedChars.Select(c => expansions.ContainsKey(c.text) ? 
expansions[c.text] : c.text)), - x0 = x0, - x1 = x1, - top = top, - doctop = top + doctopAdj, - bottom = bottom, - upright = upright, - direction = direction, - rotation = rotation - }; + var bottomRight = Tuple.Create(rightPt.Item1, belowPt.Item2); - foreach (var key in this.extraAttrs) - { - var val = orderedChars[0].GetType().GetProperty(key).GetValue(orderedChars[0]); - word.GetType().GetProperty(key).SetValue(word, val); + if (intersections.ContainsKey(bottomRight) && + edgeConnects(bottomRight, rightPt) && + edgeConnects(bottomRight, belowPt)) + { + cells.Add(new Rect(pt.Item1, pt.Item2, bottomRight.Item1, bottomRight.Item2)); + } + } + } } - return word; + return cells; } - // This method takes several factors into account to determine if - // `curr_char` represents the beginning of a new word: - // - Whether the text is "upright" (i.e., non-rotated) - // - Whether the user has specified that horizontal text runs - // left-to-right(default) or right-to-left, as represented by - // self.horizontal_ltr - // - Whether the user has specified that vertical text the text runs - // top-to-bottom(default) or bottom-to-top, as represented by - // self.vertical_ttb - // - The x0, top, x1, and bottom attributes of prev_char and - // curr_char - // - The self.x_tolerance and self.y_tolerance settings. Note: In - // this case, x/y refer to those directions for non-rotated text. - // For vertical text, they are flipped.A more accurate terminology - // might be "*intra*line character distance tolerance" and - // "*inter*line character distance tolerance" - // An important note: The* intra*line distance is measured from the - // * end* of the previous character to the *beginning* of the current - // character, while the* inter*line distance is measured from the - // * top* of the previous character to the *top* of the next - // character.The reasons for this are partly repository-historical, - // and partly logical, as successive text lines' bounding boxes often - // overlap slightly (and we don't want that overlap to be interpreted - // as the two lines being the same line). - // The upright-ness of the character determines the attributes to - // compare, while horizontal_ltr/vertical_ttb determine the direction - // of the comparison. - public bool char_begins_new_word(Character prevChar, Character currChar) - { - float x, y, ay, cy, ax, bx, cx; - - // Note: Due to the grouping step earlier in the process, - // curr_char["upright"] will always equal prev_char["upright"]. 
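// Illustrative note (not part of the port itself): for upright, left-to-right
// text with xTolerance = 3 and yTolerance = 3, the test below reduces to
// checking curr_char.x0 against [prev_char.x0, prev_char.x1 + 3] and
// curr_char.top against prev_char.top + 3. With assumed example values
//   prev_char: x0=10, x1=18, top=100   curr_char: x0=20, top=100
// none of the three conditions fires, so the character extends the current
// word; with curr_char.x0 = 25 the middle condition (cx > bx + x) fires and
// a new word begins.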
- if (currChar.upright == true) + internal static List IntersectionsToCells( + Dictionary, Dictionary>> intersections) + { + // ---------- edge_connects ---------- + bool EdgeConnects( + Tuple p1, + Tuple p2) { - x = this.xTolerance; - y = this.yTolerance; - ay = prevChar.top; - cy = currChar.top; - if (horizontalLtr == true) - { - ax = prevChar.x0; - bx = prevChar.x1; - cx = currChar.x0; - } - else + HashSet<(float, float, float, float)> EdgesToSet(List edges) { - ax = -prevChar.x1; - bx = -prevChar.x0; - cx = -currChar.x1; + var set = new HashSet<(float, float, float, float)>(); + foreach (var e in edges) + set.Add(ObjToBBox(e)); + return set; } - } - else - { - x = this.yTolerance; - y = this.xTolerance; - ay = prevChar.x0; - cy = currChar.x0; - if (verticalTtb == true) + + // Same X → vertical edges + if (p1.Item1 == p2.Item1) { - ax = prevChar.top; - bx = prevChar.bottom; - cx = currChar.top; + var common = EdgesToSet(intersections[p1]["v"]) + .Intersect(EdgesToSet(intersections[p2]["v"])); + + if (common.Any()) + return true; } - else + + // Same Y → horizontal edges + if (p1.Item2 == p2.Item2) { - ax = -prevChar.bottom; - bx = -prevChar.top; - cx = -currChar.bottom; + var common = EdgesToSet(intersections[p1]["h"]) + .Intersect(EdgesToSet(intersections[p2]["h"])); + + if (common.Any()) + return true; } + + return false; } - return (cx < ax) || (cx > bx + x) || (cy > ay + y); - } + var points = intersections.Keys + .OrderBy(p => p.Item1) + .ThenBy(p => p.Item2) + .ToList(); - public IEnumerable> iter_chars_to_words(List orderedChars) - { - List currentWord = new List(); + int nPoints = points.Count; - foreach (var charDict in orderedChars) + // ---------- find_smallest_cell ---------- + Rect FindSmallestCell(int i) { - string text = charDict.text; + if (i == nPoints - 1) + return null; - // If keep_blank_chars is false and the char is a space, we start the next word - if (!this.keepBlankChars && string.IsNullOrWhiteSpace(text)) - { - yield return currentWord; // Yield the current word - currentWord.Clear(); - } + var pt = points[i]; + var rest = points.Skip(i + 1); - // If text is a punctuation mark, split the word - else if (this.splitAtPunctuation.Contains(text)) - { - yield return currentWord; // Yield the current word - currentWord.Clear(); - currentWord.Add(charDict); // Add punctuation as a new word - yield return currentWord; // Yield the punctuation as a word - currentWord.Clear(); - } - // Check if this character begins a new word - else if (currentWord.Count > 0 && char_begins_new_word(currentWord[currentWord.Count - 1], charDict)) - { - yield return currentWord; // Yield the current word - currentWord.Clear(); - currentWord.Add(charDict); // Start a new word with this char - } - else + var below = rest.Where(p => p.Item1 == pt.Item1).ToList(); + var right = rest.Where(p => p.Item2 == pt.Item2).ToList(); + + foreach (var belowPt in below) { - currentWord.Add(charDict); // Otherwise, just add the character to the current word + if (!EdgeConnects(pt, belowPt)) + continue; + + foreach (var rightPt in right) + { + if (!EdgeConnects(pt, rightPt)) + continue; + + var bottomRight = Tuple.Create(rightPt.Item1, belowPt.Item2); + + if (intersections.ContainsKey(bottomRight) && + EdgeConnects(bottomRight, rightPt) && + EdgeConnects(bottomRight, belowPt)) + { + float x0 = pt.Item1; + float y0 = pt.Item2; + float x1 = bottomRight.Item1; + float y1 = bottomRight.Item2; + + return new Rect( + x0, + y0, + x1, + y1 + ); + } + } } + + return null; } - // Yield the last word if it exists - if 
(currentWord.Count > 0) + // ---------- generate cells ---------- + var cells = new List(); + + for (int i = 0; i < points.Count; i++) { - yield return currentWord; + var cell = FindSmallestCell(i); + if (cell != null) + cells.Add(cell); } + + return cells; } - public IEnumerable iter_sort_chars(List chars) + // ---------- obj_to_bbox ---------- + private static (float, float, float, float) ObjToBBox(Edge e) { - Func upright_key = x => -Convert.ToInt32(x.upright); - - // Sort characters based on "upright" - var uprightClusters = chars - .GroupBy(x => x.upright) - .OrderByDescending(g => g.Key) // Group by "upright" key (1 for upright, 0 for non-upright) - .ToList(); + return (e.x0, e.top, e.x1, e.bottom); + } - foreach (var uprightCluster in uprightClusters) + // cells_to_tables - Group cells into tables + internal static List> CellsToTables(Page page, List cells) + { + Func>> bboxToCorners = bbox => { - bool upright = uprightCluster.Key; - string clusterKey = upright ? "doctop" : "x0"; // Define clustering key based on upright status + return new List> + { + Tuple.Create(bbox.X0, bbox.Y0), + Tuple.Create(bbox.X0, bbox.Y1), + Tuple.Create(bbox.X1, bbox.Y0), + Tuple.Create(bbox.X1, bbox.Y1) + }; + }; + + var remainingCells = new List(cells); + var currentCorners = new HashSet>(); + var currentCells = new List(); + var tables = new List>(); - // Cluster by line using "doctop" for upright or "x0" for non-upright characters - var subclusters = uprightCluster - .GroupBy(c => upright ? c.doctop : c.x0) - .OrderBy(g => g.Key) - .ToList(); + while (remainingCells.Count > 0) + { + int initialCellCount = currentCells.Count; + var cellsToRemove = new List(); - foreach (var subcluster in subclusters) + foreach (var cell in remainingCells) { - // Sort within each subcluster - var sortedChars = subcluster.OrderBy(c => upright ? c.x0 : c.doctop).ToList(); + var cellCorners = bboxToCorners(cell); - // Reverse order if necessary - if (!(horizontalLtr && upright || verticalTtb && !upright)) + if (currentCells.Count == 0) { - sortedChars.Reverse(); + foreach (var corner in cellCorners) + currentCorners.Add(corner); + currentCells.Add(cell); + cellsToRemove.Add(cell); } - - // Yield the sorted characters - foreach (var character in sortedChars) + else { - yield return character; + int cornerCount = cellCorners.Count(c => currentCorners.Contains(c)); + if (cornerCount > 0) + { + foreach (var corner in cellCorners) + currentCorners.Add(corner); + currentCells.Add(cell); + cellsToRemove.Add(cell); + } } } - } - } - public IEnumerable>> iter_extract_tuples(List chars) - { - // Sort characters if necessary - var orderedChars = useTextFlow ? 
chars : iter_sort_chars(chars).ToList(); + foreach (var cell in cellsToRemove) + remainingCells.Remove(cell); - // Group characters by "Upright" and any extra attributes - var groupedChars = orderedChars - .GroupBy(c => new { c.upright, ExtraAttrs = string.Join(",", extraAttrs.Select(attr => attr)) }) - .ToList(); + if (currentCells.Count == initialCellCount) + { + tables.Add(new List(currentCells)); + currentCorners.Clear(); + currentCells.Clear(); + } + } - foreach (var group in groupedChars) + if (currentCells.Count > 0) + tables.Add(currentCells); + + // MuPDF modification: Remove tables without text or having only 1 column + for (int i = tables.Count - 1; i >= 0; i--) { - var charGroup = group.ToList(); // All characters in this group + var table = tables[i]; + var r = new Rect(0, 0, 0, 0); + var x1Vals = new HashSet(); + var x0Vals = new HashSet(); + + foreach (var c in table) + { + r = r | c; + x1Vals.Add(c.X1); + x0Vals.Add(c.X0); + } + + if (x1Vals.Count < 2 || x0Vals.Count < 2) + { + tables.RemoveAt(i); + continue; + } - // Assuming we have a method to split characters into words - foreach (var wordChars in iter_chars_to_words(charGroup)) + // Check if table has only whitespace + try { - // Yield the word (merged characters and the list of characters) - if (wordChars.Count > 0) + var textpage = TableGlobals.TEXTPAGE ?? page.GetTextPage(); + string text = textpage.ExtractTextBox(r.ToFzRect()); + if (string.IsNullOrWhiteSpace(text)) { - yield return new Tuple>(merge_chars(wordChars), wordChars); + tables.RemoveAt(i); + continue; } } + catch + { + // If text extraction fails, keep the table + } } - } - - public WordMap extract_wordmap(List chars) - { - // Convert the result of IterExtractTuples into a list of tuples and return a WordMap - return new WordMap(iter_extract_tuples(chars).ToList()); - } - public List extract_words(List chars) - { - // Extract words by iterating over the tuples and selecting the first item (the word) - var words = iter_extract_tuples(chars) - .Select(tuple => tuple.Item1) // Select the word (first item in the tuple) - .ToList(); + // Sort tables top-to-bottom-left-to-right + tables = tables.OrderBy(t => t.Min(c => Tuple.Create(c.Y0, c.X0))).ToList(); - return words; + return tables; } } - + + // CellGroup base class public class CellGroup { - public List Cells { get; set; } // List of tuples representing the bounding boxes - - public BBox Bbox { get; set; } + public List cells { get; set; } + public Rect bbox { get; set; } - public CellGroup(List cells) + public CellGroup(List cells) { - Cells = cells; - - // Filter out null cells and then calculate the bounding box (bbox) - var filteredCells = cells.Where(cell => cell != null).ToList(); - - // Calculate the bounding box using LINQ (equivalent to min/max in Python) - Bbox = new BBox( - filteredCells.Min(cell => cell.x0), // min x0 - filteredCells.Min(cell => cell.top), // min top - filteredCells.Max(cell => cell.x1), // max x1 - filteredCells.Max(cell => cell.bottom) // max bottom - ); + this.cells = cells; + if (cells != null && cells.Count > 0) + { + var validCells = cells.Where(c => c != null).ToList(); + if (validCells.Count > 0) + { + bbox = new Rect( + validCells.Min(c => c.X0), + validCells.Min(c => c.Y0), + validCells.Max(c => c.X1), + validCells.Max(c => c.Y1) + ); + } + } } } + // TableRow class public class TableRow : CellGroup { - // Inherits everything from CellGroup and does not add any new behavior yet. 
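// Worked example (illustrative values only): a TableRow built from the cells
//   [ Rect(10, 20, 50, 30), null, Rect(90, 20, 130, 30) ]
// keeps the null placeholder for the missing column, while the bbox inherited
// from CellGroup is computed over the non-null cells only, giving
//   Rect(10, 20, 130, 30).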
- public TableRow(List cells) : base(cells) + public TableRow(List cells) : base(cells) { } } + // TableHeader class public class TableHeader { - // Properties to hold the bounding box, cells, names, and above (external) - public BBox Bbox { get; set; } - public List Cells { get; set; } - public List Names { get; set; } - public bool External { get; set; } // Use 'object' if 'above' can be of different types + public Rect bbox { get; set; } + public List cells { get; set; } + public List names { get; set; } + public bool external { get; set; } - // Constructor - public TableHeader(BBox bbox, List cells, List names, bool above) + public TableHeader(Rect bbox, List cells, List names, bool external) { - Bbox = bbox; - Cells = cells; - Names = names; - External = above; + this.bbox = bbox; + this.cells = cells; + this.names = names; + this.external = external; } } + // Table class public class Table { - public Page Page { get; set; } // Represents the page object in your document - public List Cells { get; set; } - public TableHeader Header { get; set; } - public List Chars { get; set; } + public Page page { get; set; } + public TextPage textpage { get; set; } + public List cells { get; set; } + public TableHeader header { get; set; } - public Table(Page page, List cells, List chars) + public Table(Page page, List cells) { - this.Page = page; - this.Cells = cells; - this.Chars = chars; - this.Header = _get_header(); + this.page = page; + this.cells = cells; + this.textpage = null; + this.header = GetHeader(); } - public BBox Bbox + public Rect bbox { get { - var c = this.Cells; - return new BBox( - c.Min(cell => cell.x0), - c.Min(cell => cell.top), - c.Max(cell => cell.x1), - c.Max(cell => cell.bottom) + if (cells == null || cells.Count == 0) + return null; + return new Rect( + cells.Min(c => c.X0), + cells.Min(c => c.Y0), + cells.Max(c => c.X1), + cells.Max(c => c.Y1) ); } } - public List Rows + public List rows { get { - var sorted = this.Cells.OrderBy(cell => cell.top).ThenBy(cell => cell.x0).ToList(); - var xCoordinates = sorted.Select(cell => cell.x0).Distinct().OrderBy(x => x).ToList(); + var sorted = cells.OrderBy(c => c.Y0).ThenBy(c => c.X0).ToList(); + var xs = cells.Select(c => c.X0).Distinct().OrderBy(x => x).ToList(); var rows = new List(); - foreach (var group in sorted.GroupBy(cell => cell.top)) + foreach (var group in sorted.GroupBy(c => c.Y0)) { - var rowCells = group.ToDictionary(cell => cell.x0, cell => cell); - var row = new TableRow(rowCells.Values.ToList()); + var rowCells = group.OrderBy(c => c.X0).ToList(); + var xdict = rowCells.ToDictionary(c => c.X0, c => c); + var row = new TableRow(xs.Select(x => xdict.ContainsKey(x) ? xdict[x] : null).ToList()); rows.Add(row); } @@ -2009,416 +2298,448 @@ public List Rows } } - public int RowCount => Rows.Count; - public int ColCount => Rows.Max(row => row.Cells.Count); + public int row_count + { + get { return rows.Count; } + } + + public int col_count + { + get { return rows.Count > 0 ? 
rows.Max(r => r.cells.Count) : 0; } + } public List> Extract(Dictionary kwargs = null) { - var chars = Chars; // Placeholder for actual char extraction logic + if (kwargs == null) + kwargs = new Dictionary(); + + var chars = TableGlobals.CHARS; var tableArr = new List>(); - bool char_in_bbox(Character character, BBox bbox) + bool CharInBbox(CharDict char_, Rect bbox) { - // Calculate the vertical and horizontal midpoints of the character's bounding box - float vMid = (character.top + character.bottom) / 2; - float hMid = (character.x0 + character.x1) / 2; - - // Get the coordinates from the bounding box - float x0 = bbox.x0; - float top = bbox.top; - float x1 = bbox.x1; - float bottom = bbox.bottom; - - // Check if the character's midpoint is within the bounding box - return (hMid >= x0 && hMid < x1 && vMid >= top && vMid < bottom); + float v_mid = (char_.top + char_.bottom) / 2; + float h_mid = (char_.x0 + char_.x1) / 2; + return h_mid >= bbox.X0 && h_mid < bbox.X1 && v_mid >= bbox.Y0 && v_mid < bbox.Y1; } - foreach (var row in Rows) + foreach (var row in rows) { - var rowArr = new List(); - var rowChars = chars.Where(c => char_in_bbox(c, row.Bbox)).ToList(); + var arr = new List(); + var rowChars = chars.Where(c => CharInBbox(c, row.bbox)).ToList(); - foreach (BBox cell in row.Cells) + foreach (var cell in row.cells) { - string cellText = string.Empty; - if (cell != null) + if (cell == null) + { + arr.Add(null); + } + else { - var cellChars = rowChars.Where(c => char_in_bbox(c, cell)).ToList(); - if (cellChars.Any()) + var cellChars = rowChars.Where(c => CharInBbox(c, cell)).ToList(); + if (cellChars.Count > 0) { - if (kwargs == null) - { - kwargs = new Dictionary(); - } - kwargs["x_shift"] = cell.x0; - kwargs["y_shift"] = cell.top; - - // Check if "layout" is in kwargs and update layout_width and layout_height accordingly - if (kwargs.ContainsKey("layout")) + var cellKwargs = new Dictionary(kwargs); + cellKwargs["x_shift"] = cell.X0; + cellKwargs["y_shift"] = cell.Y0; + if (cellKwargs.ContainsKey("layout")) { - kwargs["layout_width"] = cell.x1 - cell.x0; - kwargs["layout_height"] = cell.bottom - cell.top; + cellKwargs["layout_width"] = cell.X1 - cell.X0; + cellKwargs["layout_height"] = cell.Y1 - cell.Y0; } - // Call your text extraction logic here - cellText = extract_text(cellChars, kwargs); + var cellText = ExtractText(cellChars, cellKwargs); + arr.Add(cellText); } else { - cellText = string.Empty; + arr.Add(""); } } - rowArr.Add(cellText); } - tableArr.Add(rowArr); + tableArr.Add(arr); } return tableArr; } - // Output table content as a string in Github-markdown format. - // If clean is true, markdown syntax is removed from cell content. 
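// Shape of the intended GitHub-markdown output (content values are made up):
//   |Country|Capital|
//   |---|---|
//   |France|Paris|
//   |Japan|Tokyo|
// Empty header names are replaced by "Col1", "Col2", ...; with clean == true,
// characters that carry Markdown meaning in header or cell text are escaped.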
- public string ToMarkdown(bool clean = true) + private string ExtractText(List chars, Dictionary kwargs) + { + return TextExtractionHelpers.ExtractText(chars, kwargs); + } + + public string ToMarkdown(bool clean = false, bool fillEmpty = true) { - StringBuilder output = new StringBuilder("|"); + var output = new StringBuilder(); + output.Append("|"); // Start with "|" as in Python line 1604 + int rows = row_count; + int cols = col_count; - // Generate header string and MD underline - for (int i = 0; i < Header.Names.Count; i++) + // cell coordinates + var cellBoxes = this.rows.Select(r => r.cells.ToList()).ToList(); + + // cell text strings + var cells = new List>(); + for (int i = 0; i < rows; i++) { - string name = Header.Names[i]; - if (string.IsNullOrEmpty(name)) // Generate a name if empty + cells.Add(new List()); + for (int colIdx = 0; colIdx < cols; colIdx++) { - name = $"Col{i + 1}"; + cells[i].Add(null); } + } - name = name.Replace("\n", " "); // Remove any line breaks + for (int i = 0; i < cellBoxes.Count; i++) + { + for (int colIdx = 0; colIdx < cellBoxes[i].Count && colIdx < cols; colIdx++) + { + if (cellBoxes[i][colIdx] != null) + { + cells[i][colIdx] = TableHelpers.ExtractCells(textpage, cellBoxes[i][colIdx], markdown: true); + } + } + } - if (clean) // Remove sensitive syntax + if (fillEmpty) + { + // for rows, copy content from left to right + for (int rowIdx = 0; rowIdx < rows; rowIdx++) { - name = WebUtility.HtmlEncode(name.Replace("-", "-")); + for (int i = 0; i < cols - 1; i++) + { + if (cells[rowIdx][i + 1] == null) + { + cells[rowIdx][i + 1] = cells[rowIdx][i]; + } + } } - output.Append(name + "|"); + // for columns, copy top to bottom + for (int i = 0; i < cols; i++) + { + for (int rowIdx = 0; rowIdx < rows - 1; rowIdx++) + { + if (cells[rowIdx + 1][i] == null) + { + cells[rowIdx + 1][i] = cells[rowIdx][i]; + } + } + } } - output.Append("\n"); - - // Generate the markdown header line - for (int i = 0; i < ColCount; i++) + // generate header string and MD separator + // Note: Python assumes self.header always exists, so we do the same + for (int i = 0; i < header.names.Count; i++) { - output.Append("---|"); + string name = header.names[i]; + if (string.IsNullOrEmpty(name)) + { + name = $"Col{i + 1}"; + } + name = name.Replace("\n", "
"); + if (clean) + { + name = System.Security.SecurityElement.Escape(name.Replace("-", "-")); + } + output.Append(name + "|"); } output.Append("\n"); + // insert GitHub header line separator + output.Append("|" + string.Join("|", Enumerable.Range(0, col_count).Select(_ => "---")) + "|\n"); - // Skip first row in details if header is part of the table - int j = (Header.External ? 0 : 1); + // skip first row in details if header is part of the table + int startRow = header.external ? 0 : 1; - // Iterate over detail rows - var rows = Extract(); // Assuming Extract() is a method that returns a List> - foreach (var row in rows.GetRange(j, rows.Count - j)) + // iterate over detail rows + for (int i = startRow; i < rows; i++) { - string line = "|"; - foreach (var cell in row) + output.Append("|"); + for (int k = 0; k < cols; k++) { - // Output null cells with empty string - string cellContent = cell ?? ""; - cellContent = cellContent.Replace("\n", " "); // Remove line breaks - if (clean) // Remove sensitive syntax + string cell = cells[i][k]; + if (cell == null) + cell = ""; + if (clean) { - cellContent = WebUtility.HtmlEncode(cellContent.Replace("-", "-")); + cell = System.Security.SecurityElement.Escape(cell.Replace("-", "-")); } - line += cellContent + "|"; + output.Append(cell + "|"); } - line += "\n"; - output.Append(line); + output.Append("\n"); } - return output.ToString() + "\n"; } - - // Identify the table header. - // *** PyMuPDF extension. *** - // Starting from the first line above the table upwards, check if it - // qualifies to be part of the table header. - // Criteria include: - // * A one-line table never has an extra header. - // * Column borders must not intersect any word. If this happens, all - // text of this line and above of it is ignored. - // * No excess inter-line distance: If a line further up has a distance - // of more than 1.5 times of its font size, it will be ignored and - // all lines above of it. - // * Must have same text properties. - // * Starting with the top table line, a bold text property cannot change - // back to non-bold. - // If not all criteria are met (or there is no text above the table), - // the first table row is assumed to be the header. - private TableHeader _get_header(int yTolerance = 3) - { - // Check if row 0 has bold text anywhere. - // If this is true, then any non - bold text in lines above disqualify - // these lines as header. - // bbox is the(potentially repaired) row 0 bbox. - // Returns True or False - bool top_row_is_bold(BBox _bbox) - { - List blocks = Page.GetText("dict", clip: new Rect(_bbox.x0, _bbox.top, _bbox.x1, _bbox.bottom), - flags: (int)TextFlagsExtension.TEXTFLAGS_TEXT).Blocks; - foreach (Block block in blocks) - { - foreach (Line line in block.Lines) - { - foreach (Span span in line.Spans) - { - if (((int)span.Flags & 16) != 0) - { - return true; - } - } - } + + // to_pandas - Return a pandas DataFrame version of the table + // Note: This would require the pandas.NET library or similar + // For C#, users can convert the Extract() result to their preferred data structure + public object ToPandas(Dictionary kwargs = null) + { + // In Python: returns pandas.DataFrame + // In C#: Could return DataTable, or users can use Extract() and convert manually + throw new NotImplementedException("ToPandas is not implemented in C#. 
Use Extract() and convert to your preferred data structure (e.g., DataTable)."); + } + + private string ExtractCells(TextPage textpage, Rect cell, bool markdown = false) + { + return TableHelpers.ExtractCells(textpage, cell, markdown); + } + + private TableHeader GetHeader(float yTolerance = 3.0f) + { + float yDelta = yTolerance; + + // Helper function: Check if top row has different background color + bool TopRowBgColor() + { + try + { + var bbox0 = rows[0].bbox; + var bboxt = new Rect(bbox0.X0, bbox0.Y0 - bbox0.Height, bbox0.X1, bbox0.Y0); + var (_, topColor0) = page.GetPixmap(clip: bbox0).ColorTopUsage(); + var (_, topColort) = page.GetPixmap(clip: bboxt).ColorTopUsage(); + return !topColor0.SequenceEqual(topColort); + } + catch + { + return false; } - return false; } - if (Rows.Count == 0) + // Helper function: Check if row contains bold text + bool RowHasBold(Rect rowBbox) { - return null; + return TableGlobals.CHARS.Any(c => + TableHelpers.RectInRect(new Rect(c.x0, c.y0, c.x1, c.y1), rowBbox) && c.bold); } - var row = Rows[0]; - var cells = row.Cells; - var bbox = new BBox(row.Bbox.x0, row.Bbox.top, row.Bbox.x1, row.Bbox.bottom); + if (rows == null || rows.Count == 0) + return null; - TableHeader headerTopRow = new TableHeader(bbox, cells, Extract()[0], false); + var row = rows[0]; + var cells = row.cells; + var bbox = row.bbox; + + // Return this if we determine that the top row is the header + var extractResult = Extract(); + var headerTopRow = new TableHeader( + bbox, + cells, + extractResult.Count > 0 ? extractResult[0] : new List(), + false + ); - // One-line tables have no extra header - if (Rows.Count < 2) + // 1-line tables have no extra header + if (rows.Count < 2) return headerTopRow; + // 1-column tables have no extra header if (cells.Count < 2) return headerTopRow; - // column (x) coordinates - var colX = new List(); - foreach (var cell in cells.Take(cells.Count - 1)) - { - if (cell != null) - { - colX.Add(cell.x1); // Assuming X1 is the right edge of the cell - } - } + // Assume top row is the header if second row is empty + var row2 = rows[1]; + if (row2.cells.All(c => c == null)) + return headerTopRow; // Special check: is top row bold? - // If first line above table is not bold, but top-left table cell is bold, - // we take first table row as header - bool topRowBold = top_row_is_bold(bbox); + bool topRowBold = RowHasBold(bbox); + + // Assume top row is header if it is bold and any cell of 2nd row is non-bold + if (topRowBold && !RowHasBold(row2.bbox)) + return headerTopRow; + + if (TopRowBgColor()) + return headerTopRow; + + // Column coordinates (x1 values) in top row + var colX = cells.Take(cells.Count - 1).Select(c => c != null ? c.X1 : (float?)null).ToList(); - // clip = area above table - // We will inspect this area for text qualifying as column header. - BBox clip = new BBox(bbox.x0, bbox.top, bbox.x1, bbox.bottom); - clip.top = 0; // Start at the top of the page - clip.bottom = bbox.top; // End at the top of the table + // Clip = page area above the table + var clip = new Rect(bbox.X0, 0, bbox.X1, bbox.Y0); - var spans = new List(); - List clipBlocks = Page.GetText("dict", clip:new Rect(clip.x0, clip.top, clip.x1, clip.bottom), flags: (int)TextFlagsExtension.TEXTFLAGS_TEXT).Blocks; - foreach (Block block in clipBlocks) + // Get text blocks above table + dynamic pageInfo = page.GetText("dict", clip: clip, flags: (int)TextFlagsExtension.TEXTFLAGS_TEXT); + List blocks = pageInfo?.BLOCKS ?? 
new List(); + + // Non-empty, non-superscript spans above table, sorted descending by y1 + var spans = new List>(); + foreach (var block in blocks) { - foreach (Line line in block.Lines) + if (block.Lines == null) continue; + foreach (var line in block.Lines) { - foreach (Span span in line.Spans) + if (line.Spans == null) continue; + foreach (var span in line.Spans) { - int flag = (int)span.Flags; - if ((flag & 1) == 0 && !string.IsNullOrWhiteSpace(span.Text)) - { // ignore superscripts and empty text - spans.Add(span); + if (span.Bbox == null) continue; + string text = span.Text ?? ""; + bool isWhitespace = text.All(c => TableGlobals.WHITE_SPACES.Contains(c)); + bool isSuperscript = ((int)span.Flags & (int)FontStyle.TEXT_FONT_SUPERSCRIPT) != 0; + + if (!isWhitespace && !isSuperscript) + { + spans.Add(new Dictionary + { + { "text", text }, + { "bbox", new List { span.Bbox.X0, span.Bbox.Y0, span.Bbox.X1, span.Bbox.Y1 } }, + { "flags", span.Flags } + }); } } } } - var select = new List(); // y1 coordinates above, sorted descending - var lineHeights = new List(); // line heights above, sorted descending - var lineBolds = new List(); // bold indicator per line above, same sorting + spans = spans.OrderByDescending(s => ((List)s["bbox"])[3]).ToList(); - // spans sorted descending - spans = spans.OrderByDescending(s => s.Bbox.Y1).ToList(); + var select = new List(); + var lineHeights = new List(); + var lineBolds = new List(); - // walk through the spans and fill above 3 lists + // Walk through spans and fill the 3 lists for (int i = 0; i < spans.Count; i++) { - Span span = spans[i]; - float y1 = span.Bbox.Y1; // span bottom - float height = y1 - span.Bbox.Y0; // span bbox height - bool bold = ((int)span.Flags & 16) != 0; + var s = spans[i]; + var sbbox = s["bbox"] as List; + if (sbbox == null || sbbox.Count < 4) continue; + + float y1 = Convert.ToSingle(sbbox[3]); + float h = y1 - Convert.ToSingle(sbbox[1]); + bool bold = ((int)s["flags"] & (int)FontStyle.TEXT_FONT_BOLD) != 0; - // use first item to start the lists if (i == 0) { select.Add(y1); - lineHeights.Add(height); + lineHeights.Add(h); lineBolds.Add(bold); continue; } - // get last items from the 3 lists - float y0 = select.Last(); - float prevHeight = lineHeights.Last(); - bool prevBold = lineBolds.Last(); + float y0 = select[select.Count - 1]; + float h0 = lineHeights[lineHeights.Count - 1]; + bool bold0 = lineBolds[lineBolds.Count - 1]; - if (prevBold && !bold) - break; // stop if switching from bold to non-bold + if (bold0 && !bold) + break; - // if fitting in height of previous span, modify bbox - if (y0 - y1 <= yTolerance || Math.Abs((y0 - prevHeight) - span.Bbox.Y0) <= yTolerance) + if (y0 - y1 <= yDelta || Math.Abs((y0 - h0) - Convert.ToSingle(sbbox[1])) <= yDelta) { - span.Bbox = new Rect(span.Bbox.X0, y0 - prevHeight, span.Bbox.X1, y0); - spans[i] = span; + sbbox[1] = y0 - h0; + sbbox[3] = y0; + s["bbox"] = sbbox; + spans[i] = s; if (bold) lineBolds[lineBolds.Count - 1] = bold; continue; } - else if (y0 - y1 > 1.5 * prevHeight) + else if (y0 - y1 > 1.5 * h0) { - break; // stop if distance to previous line too large + break; } select.Add(y1); - lineHeights.Add(height); + lineHeights.Add(h); lineBolds.Add(bold); } - if (!select.Any()) // nothing above the table? 
+ if (select.Count == 0) return headerTopRow; - select = select.Take(5).ToList(); // Only accept up to 5 lines in any header + select = select.Take(5).ToList(); - // take top row as header if text above table is too far apart - if (bbox.top - select.First() >= lineHeights.First()) + // Assume top row as header if text above is too far away + if (bbox.Y0 - select[0] >= lineHeights[0]) return headerTopRow; - // If top row is bold but line above is not, return top row as header - if (topRowBold && !lineBolds.First()) + // Accept top row as header if bold, but line above is not + if (topRowBold && !lineBolds[0]) return headerTopRow; - if (!spans.Any()) // nothing left above the table, return top row + if (spans.Count == 0) return headerTopRow; // Re-compute clip above table - BBox nclip = new BBox(0,0,0,0); - foreach (var span in spans.Where(s => s.Bbox.Y1 >= select.Last())) + var nclip = new Rect(0, 0, 0, 0); + foreach (var s in spans.Where(s => Convert.ToSingle(((List)s["bbox"])[3]) >= select[select.Count - 1])) { - nclip = nclip.Union(new BBox(span.Bbox.X0, span.Bbox.Y0, span.Bbox.X1, span.Bbox.Y1)); + var sbbox = s["bbox"] as List; + if (sbbox != null && sbbox.Count >= 4) + { + var srect = new Rect( + Convert.ToSingle(sbbox[0]), + Convert.ToSingle(sbbox[1]), + Convert.ToSingle(sbbox[2]), + Convert.ToSingle(sbbox[3]) + ); + nclip = nclip | srect; + } } - if (!nclip.IsEmpty()) + if (!nclip.IsEmpty) clip = nclip; - clip.bottom = bbox.bottom; // make sure we still include every word above + clip.Y1 = bbox.Y0; // Confirm that no word in clip is intersecting a column separator - List clipWords = Page.GetTextWords(clip: new Rect(clip.x0, clip.top, clip.x1, clip.bottom)); - List wordRects = clipWords.Select(w => new BBox(w.X0, w.Y0, w.X1, w.Y1)).ToList(); - List wordTops = wordRects.Select(r => r.top).Distinct().OrderByDescending(top => top).ToList(); + // Get words from textpage or page + var textpageForWords = page.GetTextPage(clip: clip); + var words = textpageForWords.ExtractWords(); + var wordRects = words.Select(w => new Rect(w.X0, w.Y0, w.X1, w.Y1)).ToList(); + var wordTops = wordRects.Select(r => r.Y0).Distinct().OrderByDescending(y => y).ToList(); - List wordSelect = new List(); + select.Clear(); + // Exclude lines with words that intersect a column border foreach (var top in wordTops) { - bool intersecting = false; - foreach (var x in colX) - { - if (x >= 0f) - { - foreach (var r in wordRects) - { - // Check if word intersects a column border - if (r.top == top && r.x0 < x && r.x1 > x) - { - intersecting = true; - break; - } - } - } - if (intersecting) - { - break; - } - } + bool hasIntersecting = colX.Any(x => + x.HasValue && wordRects.Any(r => r.Y0 == top && r.X0 < x.Value && r.X1 > x.Value)); - if (!intersecting) + if (!hasIntersecting) { - wordSelect.Add(top); + select.Add(top); } else { - // Detected a word crossing a column border break; } } - if (wordSelect.Count == 0) // nothing left over: return first row + if (select.Count == 0) return headerTopRow; - BBox hdrBbox = clip; // compute the header cells - hdrBbox.top = wordSelect.Last(); // hdr_bbox.top is the smallest top coordinate of words + var hdrBbox = new Rect(clip.X0, select[select.Count - 1], clip.X1, clip.Y1); + hdrBbox.X0 = this.bbox.X0; + hdrBbox.X1 = this.bbox.X1; - List hdrCells = new List(); - foreach (var c in cells) + var hdrCells = cells.Select(c => + c != null ? 
new Rect(c.X0, hdrBbox.Y0, c.X1, hdrBbox.Y1) : (Rect)null + ).ToList(); + + // Column names: no line breaks, no excess spaces + var hdrNames = hdrCells.Select(c => { - if (c != null) + if (c == null) return ""; + try { - hdrCells.Add(new BBox(c.x0, hdrBbox.top, c.x1, hdrBbox.bottom)); + return page.GetTextbox(c).Replace("\n", " ").Replace(" ", " ").Trim(); } - else + catch { - hdrCells.Add(null); + return ""; } - } - - // adjust left/right of header bbox - hdrBbox.x0 = Bbox.x0; - hdrBbox.x1 = Bbox.x1; - - // List to store the processed header names - List hdrNames = new List(); - - // Process each header cell - foreach (var c in hdrCells) - { - string cText = Page.GetTextbox(new Rect(c.x0, c.top, c.x1, c.bottom)); - string name = c != null ? cText.Replace("\n", " ").Replace(" ", " ").Trim() : ""; - hdrNames.Add(name); - } + }).ToList(); return new TableHeader(hdrBbox, hdrCells, hdrNames, true); } - - private string ExtractText(List> cellChars, Dictionary kwargs) - { - // Logic to extract text from characters inside a bounding box - // Placeholder logic - return string.Join(" ", cellChars.Select(c => c["text"].ToString())); - } } - public class TableSettings - { - static readonly string[] NON_NEGATIVE_SETTINGS = { - "snap_tolerance", - "snap_x_tolerance", - "snap_y_tolerance", - "join_tolerance", - "join_x_tolerance", - "join_y_tolerance", - "edge_min_length", - "min_words_vertical", - "min_words_horizontal", - "intersection_tolerance", - "intersection_x_tolerance", - "intersection_y_tolerance", - }; + // TableSettings class + public class TableSettings + { public string vertical_strategy { get; set; } = "lines"; public string horizontal_strategy { get; set; } = "lines"; - public List explicit_vertical_lines { get; set; } = null; - public List explicit_horizontal_lines { get; set; } = null; + public List explicit_vertical_lines { get; set; } = null; + public List explicit_horizontal_lines { get; set; } = null; public float snap_tolerance { get; set; } = TableFlags.TABLE_DEFAULT_SNAP_TOLERANCE; public float snap_x_tolerance { get; set; } = TableFlags.TABLE_UNSET; public float snap_y_tolerance { get; set; } = TableFlags.TABLE_UNSET; @@ -2435,119 +2756,79 @@ public class TableSettings public TableSettings PostInit() { - // Clean up user-provided table settings. - // Validates that the table settings provided consists of acceptable values and - // returns a cleaned up version.The cleaned up version fills out the missing - // values with the default values in the provided settings. - // TODO: Can be further used to validate that the values are of the correct - // type.For example, raising a value error when a non-boolean input is - // provided for the key ``keep_blank_chars``. - // :param table_settings: User - provided table settings. - // :returns: A cleaned up version of the user - provided table settings. - // :raises ValueError: When an unrecognised key is provided. 
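// Usage sketch (hypothetical values): passing a dictionary such as
//   { "vertical_strategy": "text", "snap_tolerance": 5f, "text_x_tolerance": 2f }
// to TableSettings.Resolve(...) routes keys starting with "text_" into
// text_settings (here as "x_tolerance" = 2f), assigns the remaining entries to
// the matching properties, and PostInit() then copies snap_tolerance into
// snap_x_tolerance / snap_y_tolerance because those were left at TABLE_UNSET.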
- - foreach (string setting in NON_NEGATIVE_SETTINGS) + // Validate non-negative settings + var nonNegativeSettings = new[] { - PropertyInfo property = typeof(TableSettings).GetProperty(setting); - if (property != null) - { - var value = property.GetValue(this); - if ((float)value < 0) - { - throw new ArgumentException("Table setting " + setting + " cannot be negative"); - } - } - else - { - throw new ArgumentException("Table setting not include property " + setting); - } - } + "snap_tolerance", "snap_x_tolerance", "snap_y_tolerance", + "join_tolerance", "join_x_tolerance", "join_y_tolerance", + "edge_min_length", "min_words_vertical", "min_words_horizontal", + "intersection_tolerance", "intersection_x_tolerance", "intersection_y_tolerance" + }; - foreach (string orientation in new string[] { "horizontal", "vertical" }) + foreach (var setting in nonNegativeSettings) { - PropertyInfo property = typeof(TableSettings).GetProperty(orientation + "_strategy"); - if (property != null) - { - var strategy = property.GetValue(this); - if (Array.IndexOf(TableFlags.TABLE_STRATEGIES, strategy) == -1) - { - throw new ArgumentException(orientation + "_strategy must be one of " + string.Join(",", TableFlags.TABLE_STRATEGIES)); - } - } - else + var value = (float)GetType().GetProperty(setting).GetValue(this); + if (value < 0) { - throw new ArgumentException("Table setting not include property " + orientation + "_strategy"); + throw new ArgumentException($"Table setting '{setting}' cannot be negative"); } } - if (this.text_settings == null) - this.text_settings = new Dictionary(); - - // This next section is for backwards compatibility - foreach (string attr in new string[] { "x_tolerance", "y_tolerance" }) + // Validate strategies + if (!TableFlags.TABLE_STRATEGIES.Contains(vertical_strategy)) { - if (!this.text_settings.ContainsKey(attr)) - { - this.text_settings[attr] = this.text_settings.ContainsKey("tolerance") ? 
this.text_settings["tolerance"] : 3.0f; - } + throw new ArgumentException($"vertical_strategy must be one of {{{string.Join(",", TableFlags.TABLE_STRATEGIES)}}}"); } - if (this.text_settings.ContainsKey("tolerance")) + if (!TableFlags.TABLE_STRATEGIES.Contains(horizontal_strategy)) { - this.text_settings.Remove("tolerance"); + throw new ArgumentException($"horizontal_strategy must be one of {{{string.Join(",", TableFlags.TABLE_STRATEGIES)}}}"); } - // End of that section - var mappings = new (string attr, string fallback)[] - { - ("snap_x_tolerance", "snap_tolerance"), - ("snap_y_tolerance", "snap_tolerance"), - ("join_x_tolerance", "join_tolerance"), - ("join_y_tolerance", "join_tolerance"), - ("intersection_x_tolerance", "intersection_tolerance"), - ("intersection_y_tolerance", "intersection_tolerance") - }; - foreach (var (attr, fallback) in mappings) + if (text_settings == null) { - // Get the property info for the current attribute and fallback - PropertyInfo attrProperty = typeof(TableSettings).GetProperty(attr); - PropertyInfo fallbackProperty = typeof(TableSettings).GetProperty(fallback); - - if (attrProperty != null && fallbackProperty != null) - { - float attrValue = (float)attrProperty.GetValue(this); - if (attrValue == TableFlags.TABLE_UNSET) - { - float fallbackValue = (float)fallbackProperty.GetValue(this); - attrProperty.SetValue(this, fallbackValue); - } - } + text_settings = new Dictionary(); } + // Set defaults for unset tolerances + if (snap_x_tolerance == TableFlags.TABLE_UNSET) + snap_x_tolerance = snap_tolerance; + if (snap_y_tolerance == TableFlags.TABLE_UNSET) + snap_y_tolerance = snap_tolerance; + if (join_x_tolerance == TableFlags.TABLE_UNSET) + join_x_tolerance = join_tolerance; + if (join_y_tolerance == TableFlags.TABLE_UNSET) + join_y_tolerance = join_tolerance; + if (intersection_x_tolerance == TableFlags.TABLE_UNSET) + intersection_x_tolerance = intersection_tolerance; + if (intersection_y_tolerance == TableFlags.TABLE_UNSET) + intersection_y_tolerance = intersection_tolerance; + return this; } - public static TableSettings resolve(object settings = null) + public static TableSettings Resolve(object settings = null) { if (settings == null) { - return new TableSettings(); + return new TableSettings().PostInit(); } - else if (settings is TableSettings tableSettings) + + if (settings is TableSettings ts) { - return tableSettings; + return ts.PostInit(); } - else if (settings is Dictionary settingsDict) + + if (settings is Dictionary dict) { var coreSettings = new Dictionary(); var textSettings = new Dictionary(); - // Loop over the dictionary and separate text_ settings - foreach (var kvp in settingsDict) + foreach (var kvp in dict) { if (kvp.Key.StartsWith("text_")) { - textSettings[kvp.Key.Substring(5)] = kvp.Value.ToString(); + textSettings[kvp.Key.Substring(5)] = kvp.Value; } else { @@ -2555,669 +2836,834 @@ public static TableSettings resolve(object settings = null) } } - // Add textSettings to coreSettings before passing to the constructor coreSettings["text_settings"] = textSettings; - var instance = new TableSettings(); + var tableSettings = new TableSettings(); foreach (var kvp in coreSettings) { - var property = instance.GetType().GetProperty(kvp.Key); - - if (property != null) + var prop = typeof(TableSettings).GetProperty(kvp.Key); + if (prop != null && prop.CanWrite) { - property.SetValue(instance, kvp.Value); + prop.SetValue(tableSettings, kvp.Value); } - else + } + + return tableSettings.PostInit(); + } + + throw new ArgumentException($"Cannot 
resolve settings: {settings}"); + } + } + + // FindTables function - C# port of find_tables from table.py + public static class TableFinderHelper + { + /// + /// Find tables on a page and return a TableFinder object. + /// This is the C# port of the find_tables function from table.py. + /// + public static TableFinder FindTables( + Page page, + Rect clip = null, + string vertical_strategy = "lines", + string horizontal_strategy = "lines", + List vertical_lines = null, + List horizontal_lines = null, + float snap_tolerance = TableFlags.TABLE_DEFAULT_SNAP_TOLERANCE, + float? snap_x_tolerance = null, + float? snap_y_tolerance = null, + float join_tolerance = TableFlags.TABLE_DEFAULT_JOIN_TOLERANCE, + float? join_x_tolerance = null, + float? join_y_tolerance = null, + float edge_min_length = 3.0f, + float min_words_vertical = TableFlags.TABLE_DEFAULT_MIN_WORDS_VERTICAL, + float min_words_horizontal = TableFlags.TABLE_DEFAULT_MIN_WORDS_HORIZONTAL, + float intersection_tolerance = 3.0f, + float? intersection_x_tolerance = null, + float? intersection_y_tolerance = null, + float text_tolerance = 3.0f, + float text_x_tolerance = 3.0f, + float text_y_tolerance = 3.0f, + string strategy = null, + List> add_lines = null, + List add_boxes = null, + List paths = null + ) + { + // Clear global state + TableGlobals.CHARS.Clear(); + TableGlobals.EDGES.Clear(); + TableGlobals.TEXTPAGE = null; + + // Handle page rotation + int oldRotation = page.Rotation; + bool needsRotationReset = oldRotation != 0; + Rect oldMediabox = null; + + if (needsRotationReset) + { + oldMediabox = page.MediaBox; + page.SetRotation(0); + // Note: In Python, page_rotation_set0 also handles xref and mediabox changes + // For now, we'll just reset rotation - full implementation may require more complex handling + } + + // Handle UNSET values (None in Python becomes null in C#, use TABLE_UNSET) + float snapX = snap_x_tolerance ?? TableFlags.TABLE_UNSET; + float snapY = snap_y_tolerance ?? TableFlags.TABLE_UNSET; + float joinX = join_x_tolerance ?? TableFlags.TABLE_UNSET; + float joinY = join_y_tolerance ?? TableFlags.TABLE_UNSET; + float interX = intersection_x_tolerance ?? TableFlags.TABLE_UNSET; + float interY = intersection_y_tolerance ?? 
TableFlags.TABLE_UNSET; + + if (strategy != null) + { + vertical_strategy = strategy; + horizontal_strategy = strategy; + } + + Dictionary settings = new Dictionary + { + { "vertical_strategy", vertical_strategy }, + { "horizontal_strategy", horizontal_strategy }, + { "explicit_vertical_lines", vertical_lines }, + { "explicit_horizontal_lines", horizontal_lines }, + { "snap_tolerance", snap_tolerance }, + { "snap_x_tolerance", snapX }, + { "snap_y_tolerance", snapY }, + { "join_tolerance", join_tolerance }, + { "join_x_tolerance", joinX }, + { "join_y_tolerance", joinY }, + { "edge_min_length", edge_min_length }, + { "min_words_vertical", min_words_vertical }, + { "min_words_horizontal", min_words_horizontal }, + { "intersection_tolerance", intersection_tolerance }, + { "intersection_x_tolerance", interX }, + { "intersection_y_tolerance", interY }, + { "text_tolerance", text_tolerance }, + { "text_x_tolerance", text_x_tolerance }, + { "text_y_tolerance", text_y_tolerance } + }; + + TableFinder tbf = null; + try + { + // Get layout information if available + List layoutBoxes = new List(); + try + { + // Try to get layout information - this may not be available in all MuPDF.NET versions + // In Python: page.get_layout() and page.layout_information + // For now, we'll skip this and proceed with table detection + } + catch + { + // Layout information not available, continue without it + } + + // Resolve settings + TableSettings tset = TableSettings.Resolve(settings); + + // Create character list + TextPage textpage = TablePageProcessing.MakeChars(page, clip: clip); + TableGlobals.TEXTPAGE = textpage; + + // Create edges + TablePageProcessing.MakeEdges( + page, + clip: clip, + tset: tset, + paths: paths, + addLines: add_lines, + addBoxes: add_boxes + ); + + // Create TableFinder + tbf = new TableFinder(page, tset); + tbf.textpage = textpage; + + // Filter tables based on layout boxes if available + if (layoutBoxes.Count > 0) + { + tbf.tables = tbf.tables.Where(tab => + layoutBoxes.Any(box => IoU(tab.bbox, box) >= 0.6f) + ).ToList(); + + // Find layout boxes that don't match any found table + List unmatchedBoxes = layoutBoxes.Where(box => + tbf.tables.All(tab => IoU(box, tab.bbox) < 0.6f) + ).ToList(); + + // Create tables from unmatched layout boxes + if (unmatchedBoxes.Count > 0) { - throw new ArgumentException($"Invalid parameter: {kvp.Key}"); + // Extract words for make_table_from_bbox + var words = textpage.ExtractWords(); + List wordRects = words.Select(w => new Rect(w.X0, w.Y0, w.X1, w.Y1)).ToList(); + + // Create a textpage with TABLE_DETECTOR_FLAGS for make_table_from_bbox + TextPage tp2 = page.GetTextPage(flags: TableGlobals.TABLE_DETECTOR_FLAGS); + + foreach (Rect rect in unmatchedBoxes) + { + List cells = TableHelpers.MakeTableFromBbox(tp2, wordRects, rect); + if (cells.Count > 0) + { + tbf.tables.Add(new Table(page, cells)); + } + } } } - return instance.PostInit(); + // Set textpage for all tables + foreach (var table in tbf.tables) + { + table.textpage = textpage; + } } - else + catch (Exception ex) + { + // Log exception (equivalent to pymupdf.message in Python) + System.Diagnostics.Debug.WriteLine($"find_tables: exception occurred: {ex.Message}"); + return null; + } + finally { - throw new ArgumentException($"Cannot resolve settings: {settings}"); + if (needsRotationReset && oldRotation != 0) + { + page.SetRotation(oldRotation); + // Note: Full page_rotation_reset would also restore mediabox and xref + } } + + return tbf; + } + + /// + /// Compute intersection over union (IoU) 
of two rectangles. + /// + private static float IoU(Rect r1, Rect r2) + { + float ix = Math.Max(0, Math.Min(r1.X1, r2.X1) - Math.Max(r1.X0, r2.X0)); + float iy = Math.Max(0, Math.Min(r1.Y1, r2.Y1) - Math.Max(r1.Y0, r2.Y0)); + float intersection = ix * iy; + + if (intersection == 0) + return 0; + + float area1 = (r1.X1 - r1.X0) * (r1.Y1 - r1.Y0); + float area2 = (r2.X1 - r2.X0) * (r2.Y1 - r2.Y0); + return intersection / (area1 + area2 - intersection); } } + // TableFinder class public class TableFinder { - private readonly Page page; - private readonly TableSettings settings; - private readonly List edges; - private readonly Dictionary intersections; - private readonly List cells; - private readonly List
tables; - - private TextPage TEXTPAGE; - private List EDGES; - private List CHARS; - - public TableFinder(Page page, Rect clip, TableSettings settings = null) - { - TEXTPAGE = page.GetTextPage(clip, flags: (int)TextFlagsExtension.TEXTFLAGS_TEXT); - TEXTPAGE.Parent = page; - EDGES = new List(); - CHARS = new List(); - make_chars(page, clip); - make_edges(page, clip, settings); + public Page page { get; set; } + public TextPage textpage { get; set; } + public TableSettings settings { get; set; } + public List edges { get; set; } + public Dictionary, Dictionary>> intersections { get; set; } + public List cells { get; set; } + public List
tables { get; set; } + + public TableFinder(Page page, TableSettings settings = null) + { this.page = page; - this.settings = settings; - this.edges = get_edges(); - this.intersections = edges_to_intersections(this.edges, + this.settings = settings ?? TableSettings.Resolve(); + this.edges = GetEdges(); + this.intersections = EdgeProcessing.EdgesToIntersections( + this.edges, this.settings.intersection_x_tolerance, - this.settings.intersection_y_tolerance); - this.cells = intersections_to_cells(this.intersections); - this.tables = new List
(); - - foreach (var cellGroup in cells_to_tables(this.page, this.cells)) - { - this.tables.Add(new Table(this.page, cellGroup, CHARS)); - } + this.settings.intersection_y_tolerance + ); + this.cells = EdgeProcessing.IntersectionsToCells(this.intersections); + var cellGroups = EdgeProcessing.CellsToTables(this.page, this.cells); + this.tables = cellGroups.Select(cg => new Table(this.page, cg)).ToList(); } - private List get_edges() + private List GetEdges() { var settings = this.settings; + var edges = new List(); - var strategy = settings.vertical_strategy; - if (strategy == "explicit") - { - var lines = settings.explicit_vertical_lines; - if (lines.Count < 2) - { - throw new Exception("If vertical_strategy == 'explicit', " + - "explicit_vertical_lines " + - "must be specified as a list/tuple of two or more " + - "floats/ints."); - } - } - strategy = settings.horizontal_strategy; - if (strategy == "explicit") + // Validate explicit strategies + foreach (string orientation in new[] { "vertical", "horizontal" }) { - var lines = settings.explicit_horizontal_lines; - if (lines.Count < 2) + string strategy = orientation == "vertical" ? settings.vertical_strategy : settings.horizontal_strategy; + if (strategy == "explicit") { - throw new Exception("If horizontal_strategy == 'explicit', " + - "explicit_horizontal_lines " + - "must be specified as a list/tuple of two or more " + - "floats/ints."); + var lines = orientation == "vertical" ? settings.explicit_vertical_lines : settings.explicit_horizontal_lines; + if (lines == null || lines.Count < 2) + { + throw new ArgumentException( + $"If {orientation}_strategy == 'explicit', " + + $"explicit_{orientation}_lines must be specified as a list of two or more edges."); + } } } - string v_strat = settings.vertical_strategy; - string h_strat = settings.horizontal_strategy; + string vStrat = settings.vertical_strategy; + string hStrat = settings.horizontal_strategy; - List words = new List(); - if (v_strat == "text" || h_strat == "text") - words = extract_words(CHARS, settings.text_settings); + List> words = new List>(); + if (vStrat == "text" || hStrat == "text") + { + words = TextExtractionHelpers.ExtractWords(TableGlobals.CHARS, settings.text_settings ?? 
new Dictionary()); + } - List v_explicit = new List(); + // Vertical edges + var vExplicit = new List(); if (settings.explicit_vertical_lines != null) { foreach (var desc in settings.explicit_vertical_lines) { - if (desc is Edge descEdge) + if (desc is float x) { - foreach (Edge e in obj_to_edges(descEdge)) + vExplicit.Add(new Edge + { + x0 = x, + x1 = x, + top = page.Rect.Y0, + bottom = page.Rect.Y1, + height = page.Rect.Height, + orientation = "v" + }); + } + else if (desc is Dictionary dict) + { + // Convert dictionary to Edge (similar to obj_to_edges in Python) + var convertedEdges = EdgeProcessing.ObjToEdges(dict); + foreach (var e in convertedEdges) { if (e.orientation == "v") - v_explicit.Add(e); + vExplicit.Add(e); } } + else if (desc is Edge edge) + { + if (edge.orientation == "v") + vExplicit.Add(edge); + } } } - List v_base = new List(); - if (v_strat == "lines") - v_base = filter_edges(EDGES, "v"); - else if (v_strat == "lines_strict") - v_base = filter_edges(EDGES, "v", edgeType: "lines"); - else if (v_strat == "text") - v_base = words_to_edges_v(words, wordThreshold:(int)settings.min_words_vertical); - else if (v_strat == "explicit") - v_base.Clear(); - else - v_base.Clear(); + List vBase = new List(); + if (vStrat == "lines") + { + vBase = TableGlobals.EDGES.Where(e => e.orientation == "v").ToList(); + } + else if (vStrat == "lines_strict") + { + vBase = TableGlobals.EDGES.Where(e => e.orientation == "v" && e.object_type == "line").ToList(); + } + else if (vStrat == "text") + { + vBase = EdgeProcessing.WordsToEdgesV(words, (int)settings.min_words_vertical); + } - List v = v_base.Concat(v_explicit).ToList(); + var v = vBase.Concat(vExplicit).ToList(); - List h_explicit = new List(); + // Horizontal edges + var hExplicit = new List(); if (settings.explicit_horizontal_lines != null) { foreach (var desc in settings.explicit_horizontal_lines) { - if (desc is Edge descEdge) + if (desc is float y) { - foreach (Edge e in obj_to_edges(descEdge)) + hExplicit.Add(new Edge + { + x0 = page.Rect.X0, + x1 = page.Rect.X1, + top = y, + bottom = y, + width = page.Rect.Width, + orientation = "h" + }); + } + else if (desc is Dictionary dict) + { + // Convert dictionary to Edge (similar to obj_to_edges in Python) + var convertedEdges = EdgeProcessing.ObjToEdges(dict); + foreach (var e in convertedEdges) { if (e.orientation == "h") - h_explicit.Add(e); + hExplicit.Add(e); } } + else if (desc is Edge edge) + { + if (edge.orientation == "h") + hExplicit.Add(edge); + } } } - List h_base = new List(); - if (h_strat == "lines") - h_base = filter_edges(EDGES, "h"); - else if (h_strat == "lines_strict") - h_base = filter_edges(EDGES, "h", edgeType: "lines"); - else if (h_strat == "text") - h_base = words_to_edges_h(words, wordThreshold:(int)settings.min_words_horizontal); - else if (h_strat == "explicit") - h_base.Clear(); - else - h_base.Clear(); - - List h = h_base.Concat(h_explicit).ToList(); + List hBase = new List(); + if (hStrat == "lines") + { + hBase = TableGlobals.EDGES.Where(e => e.orientation == "h").ToList(); + } + else if (hStrat == "lines_strict") + { + hBase = TableGlobals.EDGES.Where(e => e.orientation == "h" && e.object_type == "line").ToList(); + } + else if (hStrat == "text") + { + hBase = EdgeProcessing.WordsToEdgesH(words, (int)settings.min_words_horizontal); + } - List edges = new List(); - edges.AddRange(v); - edges.AddRange(h); + var h = hBase.Concat(hExplicit).ToList(); - edges = merge_edges( + edges = v.Concat(h).ToList(); + edges = EdgeProcessing.MergeEdges( edges, - 
snap_x_tolerance: settings.snap_x_tolerance, - snap_y_tolerance: settings.snap_y_tolerance, - join_x_tolerance: settings.join_x_tolerance, - join_y_tolerance: settings.join_y_tolerance - ); + settings.snap_x_tolerance, + settings.snap_y_tolerance, + settings.join_x_tolerance, + settings.join_y_tolerance + ); + + return EdgeProcessing.FilterEdges(edges, minLength: settings.edge_min_length); + } - return filter_edges(edges, minLength: settings.edge_min_length); + public static List<Table>
FindTables(Page page, Rect clip, TableSettings settings) + { + var finder = new TableFinder(page, settings); + return finder.tables; } + public Table this[int i] { get { - int tcount = this.tables.Count; - if (i >= tcount || i < 0) - { + int tcount = tables.Count; + if (i >= tcount) throw new IndexOutOfRangeException("table not on page"); - } - return this.tables[i]; + while (i < 0) + i += tcount; + return tables[i]; } } + } - // Nullify page rotation. - // To correctly detect tables, page rotation must be zero. - // This function performs the necessary adjustments and returns information - // for reverting this changes. - private static Page page_rotation_set0(Page page) + // Functions for making chars and edges from page + internal static class TablePageProcessing + { + // make_chars - Extract text as "rawdict" to fill CHARS + internal static TextPage MakeChars(Page page, Rect clip = null) { - Rect mediabox = page.MediaBox; - int rot = page.Rotation; // contains normalized rotation value - // need to derotate the page's content - Rect mb = page.MediaBox; // current mediabox - - Matrix mat0 = new Matrix(); - if (rot == 90) - { - // before derotation, shift content horizontally - mat0 = new Matrix(1, 0, 0, 1, mb.Y1 - mb.X1 - mb.X0 - mb.Y0, 0); - } - else if (rot == 270) - { - // before derotation, shift content vertically - mat0 = new Matrix(1, 0, 0, 1, 0, mb.X1 - mb.Y1 - mb.Y0 - mb.X0); - } - else - { - mat0 = new Matrix(1, 0, 0, 1, -2 * mb.X0, -2 * mb.Y0); - } - - // swap x- and y-coordinates - if (rot == 90 || rot == 270) - { - float x0 = mb.X0; - float y0 = mb.Y0; - float x1 = mb.X1; - float y1 = mb.Y1; - mb.X0 = y0; - mb.Y0 = x0; - mb.X1 = y1; - mb.X1 = x1; - page.SetMediaBox(mb); - } - - page.SetRotation(0); - - return page; - } + int pageNumber = page.Number + 1; + float pageHeight = page.Rect.Height; + var ctm = page.TransformationMatrix; - private void make_chars(Page page, Rect clip = null) - { - int page_number = page.Number + 1; - float page_height = page.Rect.Height; - Matrix ctm = page.TransformationMatrix; - float doctop_base = page_height * page.Number; - List blocks = page.GetText("rawdict", textpage: TEXTPAGE).Blocks; + var flags = TableGlobals.FLAGS; + var textpage = page.GetTextPage(clip: clip, flags: flags); + TableGlobals.TEXTPAGE = textpage; - //List blocks = (page.GetText("rawdict", clip, flags: (int)TextFlagsExtension.TEXTFLAGS_TEXT) as PageInfo).Blocks; + var pageInfo = textpage.ExtractRAWDict(cropbox: clip, sort: false); + float doctopBase = pageHeight * page.Number; - foreach (var block in blocks) + foreach (var block in pageInfo.Blocks) { + if (block.Lines == null) continue; + foreach (var line in block.Lines) { - Point ldir = line.Dir; // = (cosine, sine) of angle - ldir = new Point((float)Math.Round(ldir.X, 4), (float)Math.Round(ldir.Y, 4)); - Matrix matrix = new Matrix(ldir.X, -ldir.Y, ldir.Y, ldir.X, 0, 0); - bool upright = ldir.Y == 0f; + var ldir = line.Dir; + var ldirRounded = Tuple.Create((float)Math.Round(ldir.X, 4), (float)Math.Round(ldir.Y, 4)); + var matrix = new Matrix(ldirRounded.Item1, -ldirRounded.Item2, ldirRounded.Item2, ldirRounded.Item1, 0, 0); + bool upright = ldirRounded.Item2 == 0; - foreach (var span in line.Spans.OrderBy(s => s.Bbox.X0)) + if (line.Spans == null) continue; + var sortedSpans = line.Spans.OrderBy(s => s.Bbox.X0).ToList(); + + foreach (var span in sortedSpans) { string fontname = span.Font; float fontsize = span.Size; - int color = span.Color; - - foreach (var character in span.Chars.OrderBy(c => c.Bbox.x0)) + bool 
spanBold = ((int)span.Flags & (int)FontStyle.TEXT_FONT_BOLD) != 0; + var colorInt = span.Color; + + // Extract RGB from int color (ARGB format: AARRGGBB) + // Normalize to 0-1 range for PDF color space + float r = ((colorInt >> 16) & 0xFF) / 255.0f; + float g = ((colorInt >> 8) & 0xFF) / 255.0f; + float b = (colorInt & 0xFF) / 255.0f; + + if (span.Chars == null) continue; + var sortedChars = span.Chars.OrderBy(c => c.Bbox.x0).ToList(); + + foreach (var char_ in sortedChars) { - Rect bbox = new Rect(character.Bbox); - Rect bbox_ctm = bbox * ctm; - Point origin = new Point(character.Origin) * ctm; - + var charBbox = char_.Bbox; + var bboxCtm = new Rect(charBbox) * ctm; + var origin = new Point(char_.Origin) * ctm; matrix.E = origin.X; matrix.F = origin.Y; + string text = char_.C.ToString(); - string text = character.C.ToString(); - var charDict = new Character(); - charDict.adv = upright ? bbox.X1 - bbox.X0 : bbox.Y1 - bbox.Y0; - charDict.bottom = bbox.Y1; - charDict.doctop = bbox.Y0 + doctop_base; - charDict.fontname = fontname; - charDict.height = bbox.Y1 - bbox.Y0; - charDict.matrix = matrix; - charDict.ncs = "DeviceRGB"; - charDict.non_stroking_color = color; - charDict.non_stroking_pattern = null; - charDict.object_type = "char"; - charDict.page_number = page_number; - charDict.size = upright ? fontsize : bbox.Y1 - bbox.Y0; - charDict.stroking_color = color; - charDict.stroking_pattern = null; - charDict.text = text; - charDict.top = bbox.Y0; - charDict.upright = upright; - charDict.width = bbox.X1 - bbox.X0; - charDict.x0 = bbox.X0; - charDict.x1 = bbox.X1; - charDict.y0 = bbox_ctm.Y0; - charDict.y1 = bbox_ctm.Y1; - CHARS.Add(charDict); + var charDict = new CharDict + { + adv = upright ? (charBbox.x1 - charBbox.x0) : (charBbox.y1 - charBbox.y0), + bottom = charBbox.y1, + doctop = charBbox.y0 + doctopBase, + fontname = fontname, + height = charBbox.y1 - charBbox.y0, + matrix = Tuple.Create(matrix.A, matrix.B, matrix.C, matrix.D, matrix.E, matrix.F), + ncs = "DeviceRGB", + non_stroking_color = Tuple.Create(r, g, b), + non_stroking_pattern = null, + object_type = "char", + page_number = pageNumber, + size = upright ? fontsize : (charBbox.y1 - charBbox.y0), + stroking_color = Tuple.Create(r, g, b), + stroking_pattern = null, + bold = spanBold, + text = text, + top = charBbox.y0, + upright = upright, + width = charBbox.x1 - charBbox.x0, + x0 = charBbox.x0, + x1 = charBbox.x1, + y0 = bboxCtm.Y0, + y1 = bboxCtm.Y1 + }; + + TableGlobals.CHARS.Add(charDict); } } } } - } - // ------------------------------------------------------------------------ - // Extract all page vector graphics to fill the EDGES list. - // We are ignoring Bézier curves completely and are converting everything - // else to lines. 
- // ------------------------------------------------------------------------ + return textpage; + } - private void make_edges(Page page, Rect clip = null, TableSettings tset = null) + // make_edges - Extract all page vector graphics to fill the EDGES list + internal static void MakeEdges( + Page page, + Rect clip = null, + TableSettings tset = null, + List paths = null, + List> addLines = null, + List addBoxes = null) { - float snap_x = tset.snap_x_tolerance; - float snap_y = tset.snap_y_tolerance; - float min_length = tset.edge_min_length; + if (tset == null) + tset = TableSettings.Resolve(); + float snapX = tset.snap_x_tolerance; + float snapY = tset.snap_y_tolerance; + float minLength = tset.edge_min_length; bool linesStrict = tset.vertical_strategy == "lines_strict" || tset.horizontal_strategy == "lines_strict"; - float page_height = page.Rect.Height; - float doctop_basis = page.Number * page_height; - int page_number = page.Number + 1; - Rect prect = page.Rect; + float pageHeight = page.Rect.Height; + float doctopBasis = page.Number * pageHeight; + int pageNumber = page.Number + 1; + var prect = page.Rect; if (page.Rotation == 90 || page.Rotation == 270) { - float w = prect.BottomRight.X; - float h = prect.BottomRight.Y; + float w = prect.Width; + float h = prect.Height; prect = new Rect(0, 0, h, w); } - if (clip != null) - clip = new Rect(clip); - else + if (clip == null) clip = prect; + else + clip = new Rect(clip.X0, clip.Y0, clip.X1, clip.Y1); - // Detect whether r1, r2 are neighbors. - // Defined as: - // The minimum distance between points of r1 and points of r2 is not - // larger than some delta. - // This check supports empty rect-likes and thus also lines. - // Note: - // This type of check is MUCH faster than native Rect containment checks. - bool are_neighbors(Rect r1, Rect r2) - { - return ( // check if x-coordinates of r1 are within those of r2 - (r2.X0 - snap_x <= r1.X0 && r1.X0 <= r2.X1 + snap_x) || - (r2.X0 - snap_x <= r1.X1 && r1.X1 <= r2.X1 + snap_x) - ) && ( - (r2.Y0 - snap_y <= r1.Y0 && r1.Y0 <= r2.Y1 + snap_y) || - (r2.Y0 - snap_y <= r1.Y1 && r1.Y1 <= r2.Y1 + snap_y) - ) || // same check with r1 / r2 exchanging their roles (this is necessary!) - ( - (r1.X0 - snap_x <= r2.X0 && r2.X0 <= r1.X1 + snap_x) || - (r1.X0 - snap_x <= r2.X1 && r2.X1 <= r1.X1 + snap_x) - ) && ( - (r1.Y0 - snap_y <= r2.Y0 && r2.Y0 <= r1.Y1 + snap_y) || - (r1.Y0 - snap_y <= r2.Y1 && r2.Y1 <= r1.Y1 + snap_y) - ); + // Helper: Check if two rects are neighbors + bool AreNeighbors(Rect r1, Rect r2) + { + if ((r2.X0 - snapX <= r1.X0 && r1.X0 <= r2.X1 + snapX || + r2.X0 - snapX <= r1.X1 && r1.X1 <= r2.X1 + snapX) && + (r2.Y0 - snapY <= r1.Y0 && r1.Y0 <= r2.Y1 + snapY || + r2.Y0 - snapY <= r1.Y1 && r1.Y1 <= r2.Y1 + snapY)) + return true; + + if ((r1.X0 - snapX <= r2.X0 && r2.X0 <= r1.X1 + snapX || + r1.X0 - snapX <= r2.X1 && r2.X1 <= r1.X1 + snapX) && + (r1.Y0 - snapY <= r2.Y0 && r2.Y0 <= r1.Y1 + snapY || + r1.Y0 - snapY <= r2.Y1 && r2.Y1 <= r1.Y1 + snapY)) + return true; + + return false; } - // Detect and join rectangles of "connected" vector graphics. - (List, List) clean_graphics() + // Helper: Clean graphics - detect and join rectangles + Tuple, List> CleanGraphics(List npaths = null) { - // Detect and join rectangles of "connected" vector graphics. - List _paths = new List(); + List allpaths = npaths ?? 
page.GetDrawings(); + var pathsList = new List(); - foreach (var p in page.GetDrawings()) + foreach (var p in allpaths) { - // ignore fill-only graphics if they do not simulate lines, - // which means one of width or height are small. - if (p.Type == "f" && linesStrict && p.Rect.Width > snap_x && p.Rect.Height > snap_y) - { + if (linesStrict && p.Type == "f" && p.Rect.Width > snapX && p.Rect.Height > snapY) continue; - } - _paths.Add(p); - } - - // start with all vector graphics rectangles - List prects = _paths.Select(p => p.Rect) - .Distinct() - .OrderBy(r => (r.Y1, r.X0)) - .ToList(); - - List _bboxes = new List(); - foreach (var p in prects) - { - _bboxes.Add(BBox.RectToBBox(p)); - } - _bboxes = _bboxes.Distinct().ToList(); - prects.Clear(); - foreach (var b in _bboxes) - { - prects.Add(BBox.BBoxToRect(b)); + pathsList.Add(p); } - List newRects = new List(); // the final list of joined rectangles + var prects = pathsList.Select(p => p.Rect).Distinct() + .OrderBy(r => r.Y1).ThenBy(r => r.X0).ToList(); + var newRects = new List(); - // ---------------------------------------------------------------- - // Strategy: Join rectangles that "almost touch" each other. - // Extend first rectangle with any other that is a "neighbor". - // Then move it to the final list and continue with the rest. - // ---------------------------------------------------------------- - while (prects.Count > 0) // The algorithm will empty this list. + while (prects.Count > 0) { - Rect prect0 = prects[0]; // Copy of the first rectangle (performance reasons). + var prect0 = prects[0]; bool repeat = true; - while (repeat) // This loop extends the first rect in the list. + while (repeat) { - repeat = false; // Set to true again if some other rect touches. - - for (int i = prects.Count - 1; i > 0; i--) // Run backwards. + repeat = false; + for (int i = prects.Count - 1; i > 0; i--) { - if (are_neighbors(prect0, prects[i])) // Close enough to rect 0? + if (AreNeighbors(prect0, prects[i])) { - // Extend rect 0. - prect0.X0 = Math.Min(prect0.X0, prects[i].X0); - prect0.Y0 = Math.Min(prect0.Y0, prects[i].Y0); - prect0.X1 = Math.Max(prect0.X1, prects[i].X1); - prect0.Y1 = Math.Max(prect0.Y1, prects[i].Y1); - - prects.RemoveAt(i); // Delete this rect. - repeat = true; // Keep checking the rest. + prect0 = prect0 | prects[i]; + prects.RemoveAt(i); + repeat = true; } } } - // Move rect 0 over to the result list if there is some text in it. - if (!string.IsNullOrWhiteSpace(page.GetTextbox(prect0, textPage: TEXTPAGE))) - { - // Contains text, so accept it as a table bbox candidate. + if (TableHelpers.CharsInRect(TableGlobals.CHARS, prect0)) newRects.Add(prect0); - } - prects.RemoveAt(0); // Remove from rect list. + prects.RemoveAt(0); } - return (newRects, _paths); + return Tuple.Create(newRects, pathsList); } - (List bboxes, List paths) = clean_graphics(); + var (bboxes, cleanedPaths) = CleanGraphics(paths); + // Helper: Check if line is roughly axis-parallel bool IsParallel(Point p1, Point p2) { - if (p1 == null || p2 == null) - { - return false; - } - // Check if the line is roughly parallel to either the X or Y axis - if (Math.Abs(p1.X - p2.X) <= snap_x || Math.Abs(p1.Y - p2.Y) <= snap_y) - { - return true; - } - return false; + return Math.Abs(p1.X - p2.X) <= snapX || Math.Abs(p1.Y - p2.Y) <= snapY; } - // Given 2 points, make a line dictionary for table detection. 
- Edge make_line(PathInfo p, Point p1, Point p2, Rect _clip) + // Helper: Make line dictionary + Dictionary MakeLine(PathInfo p, Point p1, Point p2, Rect clipRect) { - if (!IsParallel(p1, p2)) // only accepting axis-parallel lines - { + if (!IsParallel(p1, p2)) return null; - } - // Compute the extremal values float x0 = Math.Min(p1.X, p2.X); float x1 = Math.Max(p1.X, p2.X); float y0 = Math.Min(p1.Y, p2.Y); float y1 = Math.Max(p1.Y, p2.Y); - // Check for outside _clip - if (x0 > _clip.X1 || x1 < _clip.X0 || y0 > _clip.Y1 || y1 < _clip.Y0) - { + if (x0 > clipRect.X1 || x1 < clipRect.X0 || y0 > clipRect.Y1 || y1 < clipRect.Y0) return null; - } - - if (x0 < _clip.X0) x0 = _clip.X0; // Adjust to _clip boundary - if (x1 > _clip.X1) x1 = _clip.X1; // Adjust to _clip boundary - if (y0 < _clip.Y0) y0 = _clip.Y0; // Adjust to _clip boundary - if (y1 > _clip.Y1) y1 = _clip.Y1; // Adjust to _clip boundary - float width = x1 - x0; // From adjusted values - float height = y1 - y0; // From adjusted values + if (x0 < clipRect.X0) x0 = clipRect.X0; + if (x1 > clipRect.X1) x1 = clipRect.X1; + if (y0 < clipRect.Y0) y0 = clipRect.Y0; + if (y1 > clipRect.Y1) y1 = clipRect.Y1; + float width = x1 - x0; + float height = y1 - y0; if (width == 0 && height == 0) - { - return null; // Nothing left to deal with - } - - Edge line_dict = new Edge(); - line_dict.x0 = x0; - line_dict.y0 = page_height - y0; - line_dict.x1 = x1; - line_dict.y1 = page_height - y1; - line_dict.width = width; - line_dict.height = height; - line_dict.pts = new Point[] { new Point(x0, y0), new Point(x1, y1) }; - line_dict.linewidth = p.Width; - line_dict.stroke = true; - line_dict.fill = false; - line_dict.evenodd = false; - line_dict.stroking_color = (p.Color != null && p.Color.Length > 0) ? p.Color : p.Fill; - line_dict.non_stroking_color = null; - line_dict.object_type = "line"; - line_dict.page_number = page_number; - line_dict.stroking_pattern = null; - line_dict.non_stroking_pattern = null; - line_dict.top = y0; - line_dict.bottom = y1; - line_dict.doctop = y0 + doctop_basis; + return null; - return line_dict; + return new Dictionary + { + { "x0", x0 }, + { "y0", pageHeight - y0 }, + { "x1", x1 }, + { "y1", pageHeight - y1 }, + { "width", width }, + { "height", height }, + { "pts", new List { new List { x0, y0 }, new List { x1, y1 } } }, + { "linewidth", p.Width }, + { "stroke", true }, + { "fill", false }, + { "evenodd", false }, + { "stroking_color", p.Color ?? 
p.Fill }, + { "non_stroking_color", null }, + { "object_type", "line" }, + { "page_number", pageNumber }, + { "stroking_pattern", null }, + { "non_stroking_pattern", null }, + { "top", y0 }, + { "bottom", y1 }, + { "doctop", y0 + doctopBasis } + }; } - foreach (PathInfo p in paths) + // Process paths + foreach (var p in cleanedPaths) { - List items = p.Items; // items in this path + if (p.Items == null) continue; + + var items = new List(p.Items); - // if 'closePath', add a line from last to first point - if (p.ClosePath && items.First().Type == "l" && items.Last().Type == "l") + // If closePath, add line from last to first point + if (p.ClosePath && items.Count > 0 && items[0].Type == "l" && items[items.Count - 1].Type == "l") { - Item line = new Item() + var lastItem = items[items.Count - 1]; + var firstItem = items[0]; + if (lastItem.P2 != null && firstItem.P1 != null) { - Type = "l", - LastPoint = new Point(items.First().P1), - P1 = new Point(items.Last().LastPoint) - }; - items.Add(line); + items.Add(new Item + { + Type = "l", + P1 = lastItem.P2, + P2 = firstItem.P1 + }); + } } - foreach (Item item in items) + foreach (var item in items) { - if (item.Type != "l" && item.Type != "re" && item.Type != "qu") // ignore anything else - continue; - - if (item.Type == "l") // a line + if (item.Type == "l") // Line { - var p1 = item.P1; - var p2 = item.P2; - var lineDict = make_line(p, p1, p2, clip); - if (lineDict != null) + if (item.P1 != null && item.LastPoint != null) { - EDGES.Add(Global.line_to_edge(lineDict)); + var lineDict = MakeLine(p, item.P1, item.LastPoint, clip); + if (lineDict != null) + { + var edge = EdgeProcessing.LineToEdge(lineDict); + TableGlobals.EDGES.Add(edge); + } } } - else if (item.Type == "re") + else if (item.Type == "re" && item.Rect != null) // Rectangle { - // A rectangle: decompose into 4 lines - Rect rect = item.Rect; // Normalize the rectangle + var rect = item.Rect; rect.Normalize(); - // If it simulates a vertical line - if (rect.Width <= min_length && rect.Width < rect.Height) + // Check if simulates a vertical line + if (rect.Width <= minLength && rect.Width < rect.Height) { - float x = (rect.X1 + rect.X0) / 2; - Point p1 = new Point(x, rect.Y0); - Point p2 = new Point(x, rect.Y1); - var lineDict = make_line(p, p1, p2, clip); + float x = Math.Abs(rect.X1 + rect.X0) / 2; + var p1 = new Point(x, rect.Y0); + var p2 = new Point(x, rect.Y1); + var lineDict = MakeLine(p, p1, p2, clip); if (lineDict != null) { - EDGES.Add(line_to_edge(lineDict)); + var edge = EdgeProcessing.LineToEdge(lineDict); + TableGlobals.EDGES.Add(edge); } continue; } - // If it simulates a horizontal line - if (rect.Height <= min_length && rect.Height < rect.Width) + // Check if simulates a horizontal line + if (rect.Height <= minLength && rect.Height < rect.Width) { - float y = (rect.Y1 + rect.Y0) / 2; + float y = Math.Abs(rect.Y1 + rect.Y0) / 2; var p1 = new Point(rect.X0, y); var p2 = new Point(rect.X1, y); - var lineDict = make_line(p, p1, p2, clip); + var lineDict = MakeLine(p, p1, p2, clip); if (lineDict != null) { - EDGES.Add(line_to_edge(lineDict)); + var edge = EdgeProcessing.LineToEdge(lineDict); + TableGlobals.EDGES.Add(edge); } continue; } - var line_dict = make_line(p, rect.TopLeft, rect.BottomLeft, clip); - if (line_dict != null) - EDGES.Add(line_to_edge(line_dict)); - line_dict = make_line(p, rect.BottomLeft, rect.BottomRight, clip); - if (line_dict != null) - EDGES.Add(line_to_edge(line_dict)); - line_dict = make_line(p, rect.BottomRight, rect.TopRight, clip); - if 
(line_dict != null) - EDGES.Add(line_to_edge(line_dict)); - line_dict = make_line(p, rect.TopRight, rect.TopLeft, clip); - if (line_dict != null) - EDGES.Add(line_to_edge(line_dict)); + // Decompose rectangle into 4 lines + var tl = new Point(rect.X0, rect.Y0); + var tr = new Point(rect.X1, rect.Y0); + var bl = new Point(rect.X0, rect.Y1); + var br = new Point(rect.X1, rect.Y1); + + var lineDict1 = MakeLine(p, tl, bl, clip); + if (lineDict1 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict1)); + + var lineDict2 = MakeLine(p, bl, br, clip); + if (lineDict2 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict2)); + + var lineDict3 = MakeLine(p, br, tr, clip); + if (lineDict3 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict3)); + + var lineDict4 = MakeLine(p, tr, tl, clip); + if (lineDict4 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict4)); } - else // must be a quad (quads have 4 points) + else if (item.Type == "qu" && item.Quad != null) // Quad { - Point ul = item.Quad.UpperLeft; - Point ur = item.Quad.UpperRight; - Point ll = item.Quad.LowerLeft; - Point lr = item.Quad.LowerRight; + var quad = item.Quad; + var ul = quad.UpperLeft; + var ur = quad.UpperRight; + var ll = quad.LowerLeft; + var lr = quad.LowerRight; - var lineDict = make_line(p, ul, ll, clip); - if (lineDict != null) - { - EDGES.Add(line_to_edge(lineDict)); - } + var lineDict1 = MakeLine(p, ul, ll, clip); + if (lineDict1 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict1)); - lineDict = make_line(p, ll, lr, clip); - if (lineDict != null) - { - EDGES.Add(line_to_edge(lineDict)); - } + var lineDict2 = MakeLine(p, ll, lr, clip); + if (lineDict2 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict2)); - lineDict = make_line(p, lr, ur, clip); - if (lineDict != null) - { - EDGES.Add(line_to_edge(lineDict)); - } + var lineDict3 = MakeLine(p, lr, ur, clip); + if (lineDict3 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict3)); - lineDict = make_line(p, ur, ul, clip); - if (lineDict != null) - { - EDGES.Add(line_to_edge(lineDict)); - } + var lineDict4 = MakeLine(p, ur, ul, clip); + if (lineDict4 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict4)); } } } - // Define the path with color, fill, and width - PathInfo path = new PathInfo(); - path.Color = new float[] { 0f, 0f, 0f }; - path.Fill = null; - path.Width = 1f; - - foreach (Rect bbox in bboxes) + // Add border lines for all enveloping bboxes + var defaultPath = new PathInfo { Color = new float[] { 0, 0, 0 }, Fill = null, Width = 1 }; + foreach (var bbox in bboxes) { - var lineDict = make_line(path, bbox.TopLeft, bbox.TopRight, clip); - if (lineDict != null) - EDGES.Add(line_to_edge(lineDict)); + var tl = new Point(bbox.X0, bbox.Y0); + var tr = new Point(bbox.X1, bbox.Y0); + var bl = new Point(bbox.X0, bbox.Y1); + var br = new Point(bbox.X1, bbox.Y1); - lineDict = make_line(path, bbox.BottomLeft, bbox.BottomRight, clip); - if (lineDict != null) - EDGES.Add(line_to_edge(lineDict)); + var lineDict1 = MakeLine(defaultPath, tl, tr, clip); + if (lineDict1 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict1)); - lineDict = make_line(path, bbox.TopLeft, bbox.BottomLeft, clip); - if (lineDict != null) - EDGES.Add(line_to_edge(lineDict)); - - lineDict = make_line(path, bbox.TopRight, bbox.BottomRight, clip); - if (lineDict != null) - EDGES.Add(line_to_edge(lineDict)); - } + var lineDict2 = MakeLine(defaultPath, bl, br, clip); + if (lineDict2 != 
null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict2)); - return; - } + var lineDict3 = MakeLine(defaultPath, tl, bl, clip); + if (lineDict3 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict3)); - public static List
FindTables( - Page paramPage, - Rect clip, - TableSettings tset - ) - { - Page page = new Page(paramPage.GetPdfPage(), paramPage.Parent); + var lineDict4 = MakeLine(defaultPath, tr, br, clip); + if (lineDict4 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict4)); + } - if (page.Rotation != 0) + // Add user-specified lines + if (addLines != null) { - page = page_rotation_set0(page); + foreach (var (p1, p2) in addLines) + { + var lineDict = MakeLine(defaultPath, p1, p2, clip); + if (lineDict != null) + TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict)); + } } - TableFinder tableFinder = new TableFinder(paramPage, clip, tset); + // Add user-specified boxes + if (addBoxes != null) + { + foreach (var box in addBoxes) + { + var tl = new Point(box.X0, box.Y0); + var tr = new Point(box.X1, box.Y0); + var bl = new Point(box.X0, box.Y1); + var br = new Point(box.X1, box.Y1); + + var lineDict1 = MakeLine(defaultPath, tl, bl, clip); + if (lineDict1 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict1)); - return tableFinder.tables; + var lineDict2 = MakeLine(defaultPath, bl, br, clip); + if (lineDict2 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict2)); + + var lineDict3 = MakeLine(defaultPath, br, tr, clip); + if (lineDict3 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict3)); + + var lineDict4 = MakeLine(defaultPath, tr, tl, clip); + if (lineDict4 != null) TableGlobals.EDGES.Add(EdgeProcessing.LineToEdge(lineDict4)); + } + } } } } diff --git a/MuPDF.NET/TextPage.cs b/MuPDF.NET/TextPage.cs index b603815..aad7d6b 100644 --- a/MuPDF.NET/TextPage.cs +++ b/MuPDF.NET/TextPage.cs @@ -22,28 +22,40 @@ static TextPage() /// /// Rect of Stext Page /// + private FzRect _mediaBox = null; private FzRect MediaBox { - get { return new FzRect(_nativeTextPage.m_internal.mediabox); } + get { + if (_mediaBox == null) + { + _mediaBox = new FzRect(_nativeTextPage.m_internal.mediabox); + } + return _mediaBox; + } } /// /// Block List of Text /// + private List _blocks = null; public List Blocks { get { - List blocks = new List(); - for ( - fz_stext_block block = _nativeTextPage.m_internal.first_block; - block != null; - block = block.next - ) + if (_blocks == null) { - blocks.Add(new FzStextBlock(block)); + List blocks = new List(); + for ( + fz_stext_block block = _nativeTextPage.m_internal.first_block; + block != null; + block = block.next + ) + { + blocks.Add(new FzStextBlock(block)); + } + _blocks = blocks; } - return blocks; + return _blocks; } } @@ -1138,7 +1150,7 @@ internal void MakeTextPage2Dict(PageInfo pageDict, bool raw) blockDict.Size = mupdf.mupdf.fz_image_size(image); blockDict.Image = Utils.BinFromBuffer(buf); } - else + else if (block.m_internal.type == (int)STextBlockType.FZ_STEXT_BLOCK_TEXT) { List lineList = new List(); diff --git a/MuPDF.NET/Utils.cs b/MuPDF.NET/Utils.cs index 2161bf8..ff39097 100644 --- a/MuPDF.NET/Utils.cs +++ b/MuPDF.NET/Utils.cs @@ -1,4 +1,4 @@ -using System; +using System; using System.Collections.Generic; using System.Data; using System.Diagnostics; @@ -19,7 +19,6 @@ using Newtonsoft.Json; using SkiaSharp; using static System.Net.Mime.MediaTypeNames; -using static MuPDF.NET.Global; namespace MuPDF.NET { @@ -1623,40 +1622,47 @@ public static List
GetTables( List add_lines = null ) { - if (strategy != null) - { - vertical_strategy = strategy; - horizontal_strategy = strategy; - } - - Dictionary settings = new Dictionary - { - { "vertical_strategy", vertical_strategy }, - { "horizontal_strategy", horizontal_strategy }, - { "explicit_vertical_lines", vertical_lines }, - { "explicit_horizontal_lines", horizontal_lines }, - { "snap_tolerance", snap_tolerance }, - { "snap_x_tolerance", snap_x_tolerance }, - { "snap_y_tolerance", snap_y_tolerance }, - { "join_tolerance", join_tolerance }, - { "join_x_tolerance", join_x_tolerance }, - { "join_y_tolerance", join_y_tolerance }, - { "edge_min_length", edge_min_length }, - { "min_words_vertical", min_words_vertical }, - { "min_words_horizontal", min_words_horizontal }, - { "intersection_tolerance", intersection_tolerance }, - { "intersection_x_tolerance", intersection_x_tolerance }, - { "intersection_y_tolerance", intersection_y_tolerance }, - { "text_tolerance", text_tolerance }, - { "text_x_tolerance", text_x_tolerance }, - { "text_y_tolerance", text_y_tolerance } - }; + // Convert List to List for FindTables + List verticalLinesObj = vertical_lines?.Cast().ToList(); + List horizontalLinesObj = horizontal_lines?.Cast().ToList(); + + // Note: add_lines parameter in GetTables is List (text lines), but FindTables expects + // List> (geometric line segments). Since Line is a text line structure + // and not a geometric line, we cannot properly convert it. Pass null for now. + // If geometric line segments are needed, they should be passed directly to FindTables. + List> addLinesTuple = null; + + // Call FindTables with nullable tolerances (0.0f means use default, null means UNSET) + var finder = TableFinderHelper.FindTables( + page: page, + clip: clip, + vertical_strategy: vertical_strategy, + horizontal_strategy: horizontal_strategy, + vertical_lines: verticalLinesObj, + horizontal_lines: horizontalLinesObj, + snap_tolerance: snap_tolerance, + snap_x_tolerance: snap_x_tolerance == 0.0f ? (float?)null : snap_x_tolerance, + snap_y_tolerance: snap_y_tolerance == 0.0f ? (float?)null : snap_y_tolerance, + join_tolerance: join_tolerance, + join_x_tolerance: join_x_tolerance == 0.0f ? (float?)null : join_x_tolerance, + join_y_tolerance: join_y_tolerance == 0.0f ? (float?)null : join_y_tolerance, + edge_min_length: edge_min_length, + min_words_vertical: min_words_vertical, + min_words_horizontal: min_words_horizontal, + intersection_tolerance: intersection_tolerance, + intersection_x_tolerance: intersection_x_tolerance == 0.0f ? (float?)null : intersection_x_tolerance, + intersection_y_tolerance: intersection_y_tolerance == 0.0f ? (float?)null : intersection_y_tolerance, + text_tolerance: text_tolerance, + text_x_tolerance: text_x_tolerance, + text_y_tolerance: text_y_tolerance, + strategy: strategy, + add_lines: addLinesTuple + ); - // Resolve settings - TableSettings tset = TableSettings.resolve(settings); + if (finder == null) + return new List
(); - List
tables = TableFinder.FindTables(page, clip, tset); - return tables; + return finder.tables; } /// diff --git a/MuPDF.NET4LLM.Test/IdentifyHeadersTest.cs b/MuPDF.NET4LLM.Test/IdentifyHeadersTest.cs new file mode 100644 index 0000000..69f8f01 --- /dev/null +++ b/MuPDF.NET4LLM.Test/IdentifyHeadersTest.cs @@ -0,0 +1,90 @@ +using System.Collections.Generic; +using MuPDF.NET; +using MuPDF.NET4LLM.Helpers; + +namespace MuPDF.NET4LLM.Test +{ + [TestFixture] + public class IdentifyHeadersTest : LLMTestBase + { + [Test] + public void Constructor_WithValidDocument_CreatesInstance() + { + var doc = OpenTestDocument("Magazine.pdf"); + try + { + var identifyHeaders = new IdentifyHeaders(doc); + Assert.That(identifyHeaders, Is.Not.Null); + } + finally + { + doc.Close(); + } + } + + [Test] + public void Constructor_WithFilePath_CreatesInstance() + { + string filePath = GetResourcePath("Magazine.pdf"); + var identifyHeaders = new IdentifyHeaders(filePath); + Assert.That(identifyHeaders, Is.Not.Null); + } + + [Test] + public void Constructor_WithMaxLevelsOutOfRange_ThrowsException() + { + string filePath = GetResourcePath("Magazine.pdf"); + + Assert.Throws(() => + { + new IdentifyHeaders(filePath, maxLevels: 0); + }); + + Assert.Throws(() => + { + new IdentifyHeaders(filePath, maxLevels: 7); + }); + } + + [Test] + public void Constructor_WithSpecificPages_Works() + { + var doc = OpenTestDocument("Magazine.pdf"); + try + { + var identifyHeaders = new IdentifyHeaders(doc, pages: new List { 0 }); + Assert.That(identifyHeaders, Is.Not.Null); + } + finally + { + doc.Close(); + } + } + + [Test] + public void GetHeaderId_WithSmallFont_ReturnsEmpty() + { + var doc = OpenTestDocument("Magazine.pdf"); + try + { + var identifyHeaders = new IdentifyHeaders(doc); + var page = doc[0]; + + // Create a mock span with small font size + var span = new ExtendedSpan + { + Size = 10.0f, + Text = "Test" + }; + + string headerId = identifyHeaders.GetHeaderId(span, page); + // Should return empty for body text + Assert.That(headerId, Is.Not.Null); + } + finally + { + doc.Close(); + } + } + } +} diff --git a/MuPDF.NET4LLM.Test/LLMTestBase.cs b/MuPDF.NET4LLM.Test/LLMTestBase.cs new file mode 100644 index 0000000..ed00ea0 --- /dev/null +++ b/MuPDF.NET4LLM.Test/LLMTestBase.cs @@ -0,0 +1,27 @@ +using System.IO; +using MuPDF.NET; + +namespace MuPDF.NET4LLM.Test +{ + /// + /// Base class for MuPDF.NET4LLM tests + /// + public class LLMTestBase + { + protected string GetResourcePath(string relativePath) + { + // Get the test project directory + string testDir = Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location); + string projectDir = Path.GetFullPath(Path.Combine(testDir, "..", "..", "..")); + return Path.Combine(projectDir, "resources", relativePath); + } + + protected Document OpenTestDocument(string relativePath) + { + string fullPath = GetResourcePath(relativePath); + if (!File.Exists(fullPath)) + throw new FileNotFoundException($"Test resource not found: {fullPath}"); + return new Document(fullPath); + } + } +} diff --git a/MuPDF.NET4LLM.Test/MuPDF.NET4LLM.Test.csproj b/MuPDF.NET4LLM.Test/MuPDF.NET4LLM.Test.csproj new file mode 100644 index 0000000..fbc8f3e --- /dev/null +++ b/MuPDF.NET4LLM.Test/MuPDF.NET4LLM.Test.csproj @@ -0,0 +1,35 @@ + + + + net8.0 + enable + enable + + false + true + + + + + + + + + + + + + + + + + + + + + + PreserveNewest + + + + diff --git a/MuPDF.NET4LLM.Test/MuPDF4LLMTest.cs b/MuPDF.NET4LLM.Test/MuPDF4LLMTest.cs new file mode 100644 index 0000000..bd7c892 --- /dev/null +++ 
b/MuPDF.NET4LLM.Test/MuPDF4LLMTest.cs @@ -0,0 +1,199 @@ +using System; +using System.Collections.Generic; +using MuPDF.NET; +using MuPDF.NET4LLM; +using MuPDF.NET4LLM.Llama; + +namespace MuPDF.NET4LLM.Test +{ + [TestFixture] + public class MuPDF4LLMTest : LLMTestBase + { + [Test] + public void Version_ReturnsValidVersion() + { + string version = MuPDF4LLM.Version; + Assert.That(version, Is.Not.Null); + Assert.That(version, Is.Not.Empty); + Assert.That(version.Split('.').Length, Is.GreaterThanOrEqualTo(2)); + } + + [Test] + public void VersionTuple_ReturnsValidTuple() + { + var (major, minor, patch) = MuPDF4LLM.VersionTuple; + Assert.That(major, Is.GreaterThanOrEqualTo(0)); + Assert.That(minor, Is.GreaterThanOrEqualTo(0)); + Assert.That(patch, Is.GreaterThanOrEqualTo(0)); + } + + [Test] + public void LlamaMarkdownReader_ReturnsReader() + { + var reader = MuPDF4LLM.LlamaMarkdownReader(); + Assert.That(reader, Is.Not.Null); + Assert.That(reader, Is.InstanceOf()); + } + + [Test] + public void LlamaMarkdownReader_WithMetaFilter_ReturnsReader() + { + Func, Dictionary> filter = + (meta) => { meta["custom"] = "value"; return meta; }; + + var reader = MuPDF4LLM.LlamaMarkdownReader(filter); + Assert.That(reader, Is.Not.Null); + Assert.That(reader.MetaFilter, Is.EqualTo(filter)); + } + + [Test] + public void ToMarkdown_WithValidDocument_ReturnsMarkdown() + { + var doc = OpenTestDocument("Magazine.pdf"); + try + { + string markdown = MuPDF4LLM.ToMarkdown( + doc, + header: false, + footer: false, + showProgress: false, + useOcr: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithSpecificPages_ReturnsMarkdown() + { + var doc = OpenTestDocument("Magazine.pdf"); + try + { + string markdown = MuPDF4LLM.ToMarkdown( + doc, + pages: new List { 0 }, + header: false, + footer: false, + showProgress: false, + useOcr: false + ); + + Assert.That(markdown, Is.Not.Null); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WriteImagesAndEmbedImages_ThrowsException() + { + var doc = OpenTestDocument("columns.pdf"); + try + { + Assert.Throws(() => + { + MuPDF4LLM.ToMarkdown( + doc, + writeImages: true, + embedImages: true, + showProgress: false, + useOcr: false + ); + }); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToJson_WithValidDocument_ReturnsJson() + { + var doc = OpenTestDocument("columns.pdf"); + try + { + string json = MuPDF4LLM.ToJson( + doc, + showProgress: false, + useOcr: false + ); + + Assert.That(json, Is.Not.Null); + Assert.That(json, Is.Not.Empty); + Assert.That(json.TrimStart(), Does.StartWith("{")); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToText_WithValidDocument_ReturnsText() + { + var doc = OpenTestDocument("columns.pdf"); + try + { + string text = MuPDF4LLM.ToText( + doc, + header: false, + footer: false, + showProgress: false, + useOcr: false + ); + + Assert.That(text, Is.Not.Null); + Assert.That(text, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ParseDocument_WithValidDocument_ReturnsParsedDocument() + { + var doc = OpenTestDocument("columns.pdf"); + try + { + var parsedDoc = MuPDF4LLM.ParseDocument( + doc, + showProgress: false, + useOcr: false + ); + + Assert.That(parsedDoc, Is.Not.Null); + } + finally + { + doc.Close(); + } + } + + [Test] + public void GetKeyValues_WithNonFormPDF_ReturnsEmptyDictionary() + { + var doc = OpenTestDocument("Magazine.pdf"); + try + { + var 
keyValues = MuPDF4LLM.GetKeyValues(doc); + Assert.That(keyValues, Is.Not.Null); + Assert.That(keyValues, Is.Empty); + } + finally + { + doc.Close(); + } + } + } +} diff --git a/MuPDF.NET4LLM.Test/MuPdfRagTest.cs b/MuPDF.NET4LLM.Test/MuPdfRagTest.cs new file mode 100644 index 0000000..cc8fa71 --- /dev/null +++ b/MuPDF.NET4LLM.Test/MuPdfRagTest.cs @@ -0,0 +1,361 @@ +using System; +using System.Collections.Generic; +using MuPDF.NET; +using MuPDF.NET4LLM.Helpers; +using NUnit.Framework; + +namespace MuPDF.NET4LLM.Test +{ + [TestFixture] + public class MuPdfRagTest : LLMTestBase + { + [Test] + public void ToMarkdown_BasicWithDefaultSettings_ReturnsMarkdown() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: null, // All pages + hdrInfo: null, // Auto-detect headers + writeImages: false, + embedImages: false, + ignoreImages: false, + ignoreGraphics: false, + detectBgColor: true, + imagePath: "", + imageFormat: "png", + imageSizeLimit: 0.05f, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + pageChunks: false, + pageSeparators: false, + margins: null, + dpi: 150, + pageWidth: 612, + pageHeight: null, + tableStrategy: "lines_strict", + graphicsLimit: null, + fontsizeLimit: 3.0f, + ignoreCode: false, + extractWords: false, + showProgress: false, + useGlyphs: false, + ignoreAlpha: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + Assert.That(markdown.Length, Is.GreaterThan(0)); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithIdentifyHeaders_ReturnsMarkdown() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + var identifyHeaders = new IdentifyHeaders(doc, pages: null, bodyLimit: 12.0f, maxLevels: 6); + + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, // First page only + hdrInfo: identifyHeaders, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + showProgress: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithTocHeaders_ReturnsMarkdown() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + var tocHeaders = new TocHeaders(doc); + + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, // First page only + hdrInfo: tocHeaders, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + showProgress: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithPageSeparators_ReturnsMarkdown() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: null, // All pages + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + pageSeparators: true, // Add page separators + showProgress: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + + // Verify page separators are present + Assert.That(markdown, Does.Contain("--- end of page=")); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithSpecificPages_ReturnsMarkdown() + { + 
var doc = OpenTestDocument("national-capitals.pdf"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0, 1 }, // First two pages + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + showProgress: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithInvalidImageSizeLimit_ThrowsException() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + Assert.Throws(() => + { + MuPdfRag.ToMarkdown( + doc, + imageSizeLimit: 1.5f, // Invalid: >= 1 + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + showProgress: false + ); + }); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithMargins_ReturnsMarkdown() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + margins: new List { 10.0f, 20.0f, 10.0f, 20.0f }, // left, top, right, bottom + showProgress: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithInvalidMargins_ThrowsException() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + Assert.Throws(() => + { + MuPdfRag.ToMarkdown( + doc, + margins: new List { 10.0f, 20.0f, 30.0f }, // Invalid: 3 elements + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + showProgress: false + ); + }); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithTableStrategy_ReturnsMarkdown() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + tableStrategy: "lines", + showProgress: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithIgnoreImages_ReturnsMarkdown() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: true, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + showProgress: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithIgnoreGraphics_ReturnsMarkdown() + { + var doc = OpenTestDocument("national-capitals.pdf"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + ignoreGraphics: true, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + showProgress: false + ); + + Assert.That(markdown, Is.Not.Null); + Assert.That(markdown, Is.Not.Empty); + } + finally + { + doc.Close(); + } + } + + [Test] + public void ToMarkdown_WithPageChunks_ReturnsJson() + { + var doc = 
OpenTestDocument("national-capitals.pdf"); + try + { + string result = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: GetResourcePath("national-capitals.pdf"), + forceText: true, + pageChunks: true, + showProgress: false + ); + + Assert.That(result, Is.Not.Null); + Assert.That(result, Is.Not.Empty); + // In page_chunks mode, result should be JSON or structured text + Assert.That(result.Length, Is.GreaterThan(0)); + } + finally + { + doc.Close(); + } + } + } +} diff --git a/MuPDF.NET4LLM.Test/PDFMarkdownReaderTest.cs b/MuPDF.NET4LLM.Test/PDFMarkdownReaderTest.cs new file mode 100644 index 0000000..4560083 --- /dev/null +++ b/MuPDF.NET4LLM.Test/PDFMarkdownReaderTest.cs @@ -0,0 +1,163 @@ +using System; +using System.Collections.Generic; +using System.IO; +using MuPDF.NET; +using MuPDF.NET4LLM.Llama; + +namespace MuPDF.NET4LLM.Test +{ + [TestFixture] + public class PDFMarkdownReaderTest : LLMTestBase + { + [Test] + public void Constructor_WithoutMetaFilter_CreatesReader() + { + var reader = new PDFMarkdownReader(); + Assert.That(reader, Is.Not.Null); + Assert.That(reader.MetaFilter, Is.Null); + } + + [Test] + public void Constructor_WithMetaFilter_CreatesReader() + { + Func, Dictionary> filter = + (meta) => meta; + + var reader = new PDFMarkdownReader(filter); + Assert.That(reader, Is.Not.Null); + Assert.That(reader.MetaFilter, Is.EqualTo(filter)); + } + + [Test] + public void LoadData_WithNullFilePath_ThrowsArgumentNullException() + { + var reader = new PDFMarkdownReader(); + Assert.Throws(() => + { + reader.LoadData(null); + }); + } + + [Test] + public void LoadData_WithNonExistentFile_ThrowsFileNotFoundException() + { + var reader = new PDFMarkdownReader(); + Assert.Throws(() => + { + reader.LoadData("nonexistent.pdf"); + }); + } + + [Test] + public void LoadData_WithValidFile_ReturnsDocuments() + { + var reader = new PDFMarkdownReader(); + string filePath = GetResourcePath("columns.pdf"); + + var docs = reader.LoadData(filePath); + + Assert.That(docs, Is.Not.Null); + Assert.That(docs.Count, Is.GreaterThan(0)); + } + + [Test] + public void LoadData_WithValidFile_ReturnsDocumentsWithText() + { + var reader = new PDFMarkdownReader(); + string filePath = GetResourcePath("columns.pdf"); + + var docs = reader.LoadData(filePath); + + Assert.That(docs.Count, Is.GreaterThan(0)); + foreach (var doc in docs) + { + Assert.That(doc, Is.Not.Null); + Assert.That(doc.Text, Is.Not.Null); + Assert.That(doc.ExtraInfo, Is.Not.Null); + } + } + + [Test] + public void LoadData_WithExtraInfo_IncludesExtraInfo() + { + var reader = new PDFMarkdownReader(); + string filePath = GetResourcePath("columns.pdf"); + var extraInfo = new Dictionary + { + { "custom_key", "custom_value" } + }; + + var docs = reader.LoadData(filePath, extraInfo: extraInfo); + + Assert.That(docs.Count, Is.GreaterThan(0)); + Assert.That(docs[0].ExtraInfo.ContainsKey("custom_key"), Is.True); + Assert.That(docs[0].ExtraInfo["custom_key"], Is.EqualTo("custom_value")); + } + + [Test] + public void LoadData_WithMetaFilter_AppliesFilter() + { + bool filterCalled = false; + Func, Dictionary> filter = + (meta) => + { + filterCalled = true; + meta["filtered"] = true; + return meta; + }; + + var reader = new PDFMarkdownReader(filter); + string filePath = GetResourcePath("columns.pdf"); + + var docs = reader.LoadData(filePath); + + Assert.That(filterCalled, Is.True); + Assert.That(docs.Count, Is.GreaterThan(0)); + 
Assert.That(docs[0].ExtraInfo.ContainsKey("filtered"), Is.True); + Assert.That(docs[0].ExtraInfo["filtered"], Is.EqualTo(true)); + } + + [Test] + public void LoadData_WithLoadKwargs_RespectsKwargs() + { + var reader = new PDFMarkdownReader(); + string filePath = GetResourcePath("columns.pdf"); + var loadKwargs = new Dictionary + { + { "force_text", true }, + { "write_images", false }, + { "embed_images", false } + }; + + var docs = reader.LoadData(filePath, loadKwargs: loadKwargs); + + Assert.That(docs, Is.Not.Null); + Assert.That(docs.Count, Is.GreaterThan(0)); + } + + [Test] + public void LoadData_WithStringPath_Works() + { + var reader = new PDFMarkdownReader(); + string filePath = GetResourcePath("columns.pdf"); + + var docs = reader.LoadData(filePath); + + Assert.That(docs, Is.Not.Null); + } + + [Test] + public void LoadData_IncludesPageMetadata() + { + var reader = new PDFMarkdownReader(); + string filePath = GetResourcePath("columns.pdf"); + + var docs = reader.LoadData(filePath); + + Assert.That(docs.Count, Is.GreaterThan(0)); + Assert.That(docs[0].ExtraInfo.ContainsKey("page"), Is.True); + Assert.That(docs[0].ExtraInfo.ContainsKey("total_pages"), Is.True); + Assert.That(docs[0].ExtraInfo.ContainsKey("file_path"), Is.True); + } + } +} diff --git a/MuPDF.NET4LLM.Test/README.md b/MuPDF.NET4LLM.Test/README.md new file mode 100644 index 0000000..0a24c2a --- /dev/null +++ b/MuPDF.NET4LLM.Test/README.md @@ -0,0 +1,52 @@ +# MuPDF.NET4LLM.Test + +Unit tests for the MuPDF.NET4LLM project. + +## Test Structure + +The test project follows the same structure as `MuPDF.NET.Test` and uses NUnit as the testing framework. + +## Test Classes + +- **MuPDF4LLMTest**: Tests for the main `MuPDF4LLM` static class + - Version information + - Document conversion methods (ToMarkdown, ToJson, ToText) + - LlamaIndex reader creation + - Error handling + +- **PDFMarkdownReaderTest**: Tests for the `PDFMarkdownReader` class + - Constructor tests + - LoadData method with various parameters + - MetaFilter functionality + - Error handling + +- **UtilsTest**: Tests for utility functions + - White character detection + - Bullet character detection + - Constants validation + +- **IdentifyHeadersTest**: Tests for header identification + - Constructor with various parameters + - Header ID generation + - Error handling + +- **VersionInfoTest**: Tests for version information + - Version string validation + - Minimum MuPDF version validation + +## Running Tests + +Tests can be run using: +- Visual Studio Test Explorer +- `dotnet test` command +- NUnit Test Adapter + +## Test Resources + +Test resources (PDF files) should be placed in the `resources` directory. The `columns.pdf` file is used as a sample test document. 
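+
+## Example Test
+
+A minimal sketch of a test built on the `LLMTestBase` helpers, following the same pattern as the test classes above. The class name is illustrative; the resource file and the `MuPdfRag.ToMarkdown` parameters mirror those used in `MuPdfRagTest`:
+
+```csharp
+using System.Collections.Generic;
+using MuPDF.NET;
+using MuPDF.NET4LLM.Helpers;
+using NUnit.Framework;
+
+namespace MuPDF.NET4LLM.Test
+{
+    [TestFixture]
+    public class ExampleMarkdownTest : LLMTestBase
+    {
+        [Test]
+        public void ToMarkdown_FirstPage_ReturnsNonEmptyMarkdown()
+        {
+            // Arrange: open a PDF from the resources directory.
+            var doc = OpenTestDocument("national-capitals.pdf");
+            try
+            {
+                // Act: convert the first page to Markdown without writing or embedding images.
+                string markdown = MuPdfRag.ToMarkdown(
+                    doc,
+                    pages: new List<int> { 0 },
+                    writeImages: false,
+                    embedImages: false,
+                    filename: GetResourcePath("national-capitals.pdf"),
+                    forceText: true,
+                    showProgress: false);
+
+                // Assert: some Markdown text was produced.
+                Assert.That(markdown, Is.Not.Null);
+                Assert.That(markdown, Is.Not.Empty);
+            }
+            finally
+            {
+                doc.Close();
+            }
+        }
+    }
+}
+```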
+ +## Notes + +- Tests that require OCR are disabled by default (`useOcr: false`) to avoid dependencies on OCR libraries +- Some tests may require specific PDF files in the resources directory +- Tests follow the Arrange-Act-Assert pattern diff --git a/MuPDF.NET4LLM.Test/UtilsTest.cs b/MuPDF.NET4LLM.Test/UtilsTest.cs new file mode 100644 index 0000000..7a93e4a --- /dev/null +++ b/MuPDF.NET4LLM.Test/UtilsTest.cs @@ -0,0 +1,57 @@ +using System; +using MuPDF4LLMUtils = MuPDF.NET4LLM.Helpers.Utils; + +namespace MuPDF.NET4LLM.Test +{ + [TestFixture] + public class UtilsTest + { + [Test] + public void WhiteChars_ContainsExpectedCharacters() + { + Assert.That(MuPDF4LLMUtils.WHITE_CHARS.Contains(' '), Is.True); + Assert.That(MuPDF4LLMUtils.WHITE_CHARS.Contains('\t'), Is.True); + Assert.That(MuPDF4LLMUtils.WHITE_CHARS.Contains('\n'), Is.True); + Assert.That(MuPDF4LLMUtils.WHITE_CHARS.Contains('\u00a0'), Is.True); // Non-breaking space + Assert.That(MuPDF4LLMUtils.WHITE_CHARS.Contains('a'), Is.False); + } + + [Test] + public void IsWhite_WithWhiteString_ReturnsTrue() + { + Assert.That(MuPDF4LLMUtils.IsWhite(" "), Is.True); + Assert.That(MuPDF4LLMUtils.IsWhite("\t\n"), Is.True); + Assert.That(MuPDF4LLMUtils.IsWhite("\u00a0"), Is.True); // Non-breaking space + Assert.That(MuPDF4LLMUtils.IsWhite(""), Is.True); + } + + [Test] + public void IsWhite_WithNonWhiteString_ReturnsFalse() + { + Assert.That(MuPDF4LLMUtils.IsWhite("hello"), Is.False); + Assert.That(MuPDF4LLMUtils.IsWhite(" hello "), Is.False); + Assert.That(MuPDF4LLMUtils.IsWhite("a"), Is.False); + } + + [Test] + public void Bullets_ContainsExpectedCharacters() + { + Assert.That(MuPDF4LLMUtils.BULLETS.Contains('*'), Is.True); + Assert.That(MuPDF4LLMUtils.BULLETS.Contains('-'), Is.True); + Assert.That(MuPDF4LLMUtils.BULLETS.Contains('>'), Is.True); + Assert.That(MuPDF4LLMUtils.BULLETS.Contains('o'), Is.True); + } + + [Test] + public void ReplacementCharacter_IsCorrect() + { + Assert.That(MuPDF4LLMUtils.REPLACEMENT_CHARACTER, Is.EqualTo('\uFFFD')); + } + + [Test] + public void Type3FontName_IsCorrect() + { + Assert.That(MuPDF4LLMUtils.TYPE3_FONT_NAME, Is.EqualTo("Unnamed-T3")); + } + } +} diff --git a/MuPDF.NET4LLM.Test/VersionInfoTest.cs b/MuPDF.NET4LLM.Test/VersionInfoTest.cs new file mode 100644 index 0000000..a70b859 --- /dev/null +++ b/MuPDF.NET4LLM.Test/VersionInfoTest.cs @@ -0,0 +1,24 @@ +using MuPDF.NET4LLM; + +namespace MuPDF.NET4LLM.Test +{ + [TestFixture] + public class VersionInfoTest + { + [Test] + public void Version_IsNotNull() + { + Assert.That(VersionInfo.Version, Is.Not.Null); + Assert.That(VersionInfo.Version, Is.Not.Empty); + } + + [Test] + public void MinimumMuPDFVersion_IsValid() + { + var (major, minor, patch) = VersionInfo.MinimumMuPDFVersion; + Assert.That(major, Is.GreaterThanOrEqualTo(1)); + Assert.That(minor, Is.GreaterThanOrEqualTo(0)); + Assert.That(patch, Is.GreaterThanOrEqualTo(0)); + } + } +} diff --git a/MuPDF.NET4LLM.Test/resources/Magazine.pdf b/MuPDF.NET4LLM.Test/resources/Magazine.pdf new file mode 100644 index 0000000..c8e166e Binary files /dev/null and b/MuPDF.NET4LLM.Test/resources/Magazine.pdf differ diff --git a/MuPDF.NET4LLM.Test/resources/columns.pdf b/MuPDF.NET4LLM.Test/resources/columns.pdf new file mode 100644 index 0000000..18f5f15 Binary files /dev/null and b/MuPDF.NET4LLM.Test/resources/columns.pdf differ diff --git a/MuPDF.NET4LLM.Test/resources/national-capitals.pdf b/MuPDF.NET4LLM.Test/resources/national-capitals.pdf new file mode 100644 index 0000000..d2b4721 Binary files /dev/null and 
b/MuPDF.NET4LLM.Test/resources/national-capitals.pdf differ diff --git a/MuPDF.NET4LLM/MuPDF.NET4LLM.csproj b/MuPDF.NET4LLM/MuPDF.NET4LLM.csproj new file mode 100644 index 0000000..7a76650 --- /dev/null +++ b/MuPDF.NET4LLM/MuPDF.NET4LLM.csproj @@ -0,0 +1,27 @@ + + + + netstandard2.0;net461;net472;net48;net5.0;net6.0;net7.0;net8.0 + AnyCPU;x64;x86 + false + $(Platform) + . + + + + WINDOWS + + + + LINUX + + + + + + + + + + + diff --git a/MuPDF.NET4LLM/MuPDF4LLM.cs b/MuPDF.NET4LLM/MuPDF4LLM.cs new file mode 100644 index 0000000..a633504 --- /dev/null +++ b/MuPDF.NET4LLM/MuPDF4LLM.cs @@ -0,0 +1,290 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using MuPDF.NET; +using MuPDF.NET4LLM.Helpers; +using MuPDF.NET4LLM.Llama; + +namespace MuPDF.NET4LLM +{ + /// + /// Main entry point for MuPDF.NET4LLM functionality. + /// Provides a C# facade over the helpers ported from the Python pymupdf4llm package. + /// + public static class MuPDF4LLM + { + public static string Version => VersionInfo.Version; + + public static (int major, int minor, int patch) VersionTuple + { + get + { + string[] parts = Version.Split('.'); + return ( + int.Parse(parts[0]), + parts.Length > 1 ? int.Parse(parts[1]) : 0, + parts.Length > 2 ? int.Parse(parts[2]) : 0 + ); + } + } + + /// + /// Get a LlamaIndex‑compatible PDF reader that uses + /// under the hood to produce Markdown text per page. + /// + public static PDFMarkdownReader LlamaMarkdownReader( + Func, Dictionary> metaFilter = null) + { + return new PDFMarkdownReader(metaFilter); + } + + /// + /// Process the document and return the text of the selected pages. + /// + /// Input to convert. + /// Include page headers in output. + /// Include page footers in output. + /// List of page numbers to consider (0-based). + /// Save images / graphics as files. + /// Embed images in markdown text (base64 encoded). + /// Store images in this folder. + /// Use this image format. Choose a supported one (e.g. "png", "jpg"). + /// Logical filename used in image names and metadata. + /// Output text despite of image background. + /// Whether to segment output by page. + /// Whether to include page separators in output. + /// Desired resolution for generated images. + /// DPI for OCR operations. + /// Assumption if page layout is variable (reflowable documents). + /// Assumption if page layout is variable (reflowable documents). If null, a single tall page is created. + /// Suppress code-like formatting (mono-space fonts). + /// Print progress as each page is processed. + /// If beneficial invoke OCR. + /// Language for OCR. + public static string ToMarkdown( + Document doc, + bool header = true, + bool footer = true, + List pages = null, + bool writeImages = false, + bool embedImages = false, + string imagePath = "", + string imageFormat = "png", + string filename = "", + bool forceText = true, + bool pageChunks = false, + bool pageSeparators = false, + int dpi = 150, + int ocrDpi = 400, + float pageWidth = 612, + float? 
pageHeight = null, + bool ignoreCode = false, + bool showProgress = false, + bool useOcr = true, + string ocrLanguage = "eng") + { + if (writeImages && embedImages) + throw new ArgumentException("Cannot both write_images and embed_images"); + + var parsedDoc = Helpers.DocumentLayout.ParseDocument( + doc, + filename: filename, + imageDpi: dpi, + imageFormat: imageFormat, + imagePath: imagePath, + pages: pages, + ocrDpi: ocrDpi, + writeImages: writeImages, + embedImages: embedImages, + showProgress: showProgress, + forceText: forceText, + useOcr: useOcr, + ocrLanguage: ocrLanguage); + + return parsedDoc.ToMarkdown( + header: header, + footer: footer, + writeImages: writeImages, + embedImages: embedImages, + ignoreCode: ignoreCode, + showProgress: showProgress, + pageSeparators: pageSeparators, + pageChunks: pageChunks); + } + + /// + /// High‑level helper to convert a to a JSON representation + /// of its layout (pages, boxes, metadata). Wraps + /// and + /// . + /// + /// Input to convert. + /// Desired resolution for generated images. + /// Use this image format. + /// Store images in this folder. + /// List of page numbers to consider (0-based). + /// DPI for OCR operations. + /// Save images / graphics as files. + /// Embed images in JSON (base64 encoded). + /// Print progress as each page is processed. + /// Output text despite of image background. + /// If beneficial invoke OCR. + /// Language for OCR. + public static string ToJson( + Document doc, + int imageDpi = 150, + string imageFormat = "png", + string imagePath = "", + List pages = null, + int ocrDpi = 400, + bool writeImages = false, + bool embedImages = false, + bool showProgress = false, + bool forceText = true, + bool useOcr = true, + string ocrLanguage = "eng") + { + var parsedDoc = Helpers.DocumentLayout.ParseDocument( + doc, + filename: doc.Name, + imageDpi: imageDpi, + ocrDpi: ocrDpi, + imageFormat: imageFormat, + imagePath: imagePath, + pages: pages, + showProgress: showProgress, + embedImages: embedImages, + writeImages: writeImages, + forceText: forceText, + useOcr: useOcr, + ocrLanguage: ocrLanguage); + + return parsedDoc.ToJson(); + } + + /// + /// High‑level helper to convert a to plain text, using the + /// same layout analysis as the Markdown conversion but omitting Markdown syntax. + /// Wraps and + /// . + /// + /// Input to convert. + /// Logical filename used in metadata. + /// Include page headers in output. + /// Include page footers in output. + /// List of page numbers to consider (0-based). + /// Suppress code-like formatting. + /// Print progress as each page is processed. + /// Output text despite of image background. + /// DPI for OCR operations. + /// If beneficial invoke OCR. + /// Language for OCR. + /// Table format for text output (e.g. "grid"). + /// Whether to segment output by page. 
+ public static string ToText( + Document doc, + string filename = "", + bool header = true, + bool footer = true, + List pages = null, + bool ignoreCode = false, + bool showProgress = false, + bool forceText = true, + int ocrDpi = 400, + bool useOcr = true, + string ocrLanguage = "eng", + string tableFormat = "grid", + bool pageChunks = false) + { + var parsedDoc = Helpers.DocumentLayout.ParseDocument( + doc, + filename: filename, + pages: pages, + embedImages: false, + writeImages: false, + showProgress: showProgress, + forceText: forceText, + useOcr: useOcr, + ocrLanguage: ocrLanguage); + + return parsedDoc.ToText( + header: header, + footer: footer, + ignoreCode: ignoreCode, + showProgress: showProgress, + tableFormat: tableFormat, + pageChunks: pageChunks); + } + + /// + /// Parse the logical layout of a into a + /// object, exposing pages, layout boxes, + /// tables, images and metadata. This is the C# equivalent of the Python + /// parse_document helper and is the common basis for Markdown / JSON / text output. + /// + /// Input to convert. + /// Logical filename used in metadata. + /// Desired resolution for generated images. + /// Use this image format. + /// Store images in this folder. + /// DPI for OCR operations. + /// List of page numbers to consider (0-based). + /// Save images / graphics as files. + /// Embed images (base64 encoded). + /// Print progress as each page is processed. + /// Output text despite of image background. + /// If beneficial invoke OCR. + /// Language for OCR. + public static Helpers.ParsedDocument ParseDocument( + Document doc, + string filename = "", + int imageDpi = 150, + string imageFormat = "png", + string imagePath = "", + int ocrDpi = 400, + List pages = null, + bool writeImages = false, + bool embedImages = false, + bool showProgress = false, + bool forceText = true, + bool useOcr = true, + string ocrLanguage = "eng") + { + return Helpers.DocumentLayout.ParseDocument( + doc, + filename: filename, + imageDpi: imageDpi, + imageFormat: imageFormat, + imagePath: imagePath, + ocrDpi: ocrDpi, + pages: pages, + writeImages: writeImages, + embedImages: embedImages, + showProgress: showProgress, + forceText: forceText, + useOcr: useOcr, + ocrLanguage: ocrLanguage); + } + + /// + /// Extract key / value information from interactive form fields, including + /// the pages each field appears on, similar to the Python + /// utils.extract_form_fields_with_pages helper. + /// Traverse /AcroForm/Fields hierarchy and return a dict: + /// fully qualified field name -> {"value": ..., "pages": [...]} + /// Optionally, the xref of the field is included. + /// + /// Input . + /// Include the xref of the field. + public static Dictionary> GetKeyValues( + Document doc, + bool xrefs = false) + { + if (doc.IsFormPDF != 0) + { + return Helpers.Utils.ExtractFormFieldsWithPages(doc, xrefs); + } + return new Dictionary>(); + } + } +} diff --git a/MuPDF.NET4LLM/VersionInfo.cs b/MuPDF.NET4LLM/VersionInfo.cs new file mode 100644 index 0000000..fd4eba9 --- /dev/null +++ b/MuPDF.NET4LLM/VersionInfo.cs @@ -0,0 +1,12 @@ +namespace MuPDF.NET4LLM +{ + /// + /// Version information for MuPDF.NET4LLM + /// Generated file - do not edit. 
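+    /// Example: with Version = "0.2.9", MuPDF4LLM.VersionTuple parses it into (0, 2, 9).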
+ /// + public static class VersionInfo + { + public static readonly (int Major, int Minor, int Patch) MinimumMuPDFVersion = (1, 27, 0); + public const string Version = "0.2.9"; + } +} diff --git a/MuPDF.NET4LLM/helpers/CheckOcr.cs b/MuPDF.NET4LLM/helpers/CheckOcr.cs new file mode 100644 index 0000000..465c686 --- /dev/null +++ b/MuPDF.NET4LLM/helpers/CheckOcr.cs @@ -0,0 +1,313 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using MuPDF.NET; +using mupdf; +using Char = MuPDF.NET.Char; + +namespace MuPDF.NET4LLM.Helpers +{ + /// + /// OCR decision and repair utilities. + /// Ported and adapted from the Python module helpers/check_ocr.py in pymupdf4llm. + /// + public static class CheckOcr + { + public static int FLAGS = (int)( + mupdf.mupdf.FZ_STEXT_COLLECT_STYLES | + mupdf.mupdf.FZ_STEXT_COLLECT_VECTORS | + (int)TextFlags.TEXT_PRESERVE_IMAGES | + (int)TextFlags.TEXT_ACCURATE_BBOXES + // | mupdf.mupdf.FZ_STEXT_MEDIABOX_CLIP + ); + + /// + /// Return OCR'd span text using Tesseract. + /// + /// MuPDF Page + /// MuPDF Rect or its sequence + /// Resolution for OCR image + /// The OCR-ed text of the bbox. + public static string GetSpanOcr(Page page, Rect bbox, int dpi = 300) + { + // Step 1: Make a high-resolution image of the bbox. + Pixmap pix = page.GetPixmap(dpi: dpi, clip: bbox); + byte[] ocrPdfBytes = pix.PdfOCR2Bytes(true); + + Document ocrPdf = new Document("pdf", ocrPdfBytes); + Page ocrPage = ocrPdf.LoadPage(0); + string text = ocrPage.GetText(); + text = text.Replace("\n", " ").Trim(); // Get rid of line breaks + + ocrPage.Dispose(); + ocrPdf.Close(); + pix.Dispose(); + + return text; + } + + /// + /// Repair text blocks with missing glyphs using OCR. + /// + /// TODO: Support non-linear block structure. + /// + public static List RepairBlocks(List inputBlocks, Page page, int dpi = 300) + { + List repairedBlocks = new List(); + + foreach (var block in inputBlocks) + { + if (block.Type != 0) // Accept non-text blocks as is + { + repairedBlocks.Add(block); + continue; + } + + if (block.Lines != null) + { + foreach (var line in block.Lines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + string spanText = ""; + if (span.Chars != null && span.Chars.Count > 0) + { + spanText = string.Join("", span.Chars.Select(c => c.C)); + } + else + { + spanText = span.Text ?? ""; + } + + if (!spanText.Contains(Utils.REPLACEMENT_CHARACTER)) + continue; + + int spanTextLen = spanText.Length; + string newText = GetSpanOcr(page, span.Bbox, dpi); + if (newText.Length > spanTextLen) + newText = newText.Substring(0, spanTextLen); + + if (span.Chars != null && span.Chars.Count > 0) + { + // Rebuild chars array + List newChars = new List(); + int minLen = Math.Min(newText.Length, span.Chars.Count); + for (int i = 0; i < minLen; i++) + { + Char oldChar = span.Chars[i]; + Char newChar = new Char + { + C = newText[i], + Origin = oldChar.Origin, + Bbox = oldChar.Bbox, + // Copy other properties as needed + }; + newChars.Add(newChar); + } + span.Chars = newChars; + } + else + { + span.Text = newText; + } + } + } + } + } + repairedBlocks.Add(block); + } + + return repairedBlocks; + } + + /// + /// Determine whether the page contains text worthwhile to OCR. + /// + /// MuPDF.NET Page object + /// DPI used for rasterization *if* we decide to OCR + /// Area to consider for text presence + /// + /// The full-page transformation matrix, the full-page pixmap and a + /// boolean indicating whether the page is photo-like (True) or + /// text-like (False). 
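+        /// Example (illustrative sketch; page is assumed to be a loaded MuPDF.NET Page):
+        ///     var (matrix, pix, photo) = CheckOcr.GetPageImage(page, dpi: 150);
+        ///     if (photo)
+        ///         Console.WriteLine("Photo-like page - OCR will be skipped.");
+        ///     else if (pix != null)
+        ///         Console.WriteLine($"Rasterized to {pix.W} x {pix.H} for OCR.");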
+ /// + public static (Matrix matrix, Pixmap pix, bool photo) GetPageImage( + Page page, + int dpi = 150, + Rect covered = null) + { + if (covered == null) + covered = page.Rect; + + IRect irect = new IRect((int)covered.X0, (int)covered.Y0, + (int)covered.X1, (int)covered.Y1); + + // Make a gray pixmap of the covered area + Rect clipRect = new Rect(covered); + Pixmap pixCovered = page.GetPixmap(colorSpace: "gray", clip: clipRect); + + // Convert to byte array for image quality analysis (convert to numpy array) + int width = pixCovered.W; + int height = pixCovered.H; + byte[] samples = pixCovered.SAMPLES; + + // Create 2D array for image quality analysis + byte[,] gray = new byte[height, width]; + int sampleIndex = 0; + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x++) + { + gray[y, x] = samples[sampleIndex++]; + } + } + + // Run photo checks + var scores = ImageQuality.AnalyzeImage(gray); + double score = scores.ContainsKey("score") ? scores["score"].value : 0; + + if (score >= 3) + { + pixCovered.Dispose(); + return (new Matrix(1, 0, 0, 1, 0, 0), null, true); // Identity matrix + } + else + { + Pixmap pix = page.GetPixmap(dpi: dpi); + IRect pixRect = new IRect(0, 0, pix.W, pix.H); + Matrix matrix = new Matrix( + page.Rect.Width / pix.W, + 0, + 0, + page.Rect.Height / pix.H, + 0, + 0 + ); + pixCovered.Dispose(); + return (matrix, pix, false); + } + } + + /// + /// Decide whether a MuPDF.NET page should be OCR'd. + /// + /// MuPDF.NET page object + /// DPI used for rasterization + /// Minimum number of vector paths to suggest glyph simulation + /// Fraction of page area covered by images to trigger OCR + /// Fraction of readable characters to skip OCR + /// Output of page.get_text("dict") if already available + /// Dictionary with decision and diagnostic flags + public static Dictionary ShouldOcrPage( + Page page, + int dpi = 150, + float vectorThresh = 0.9f, + float imageCoverageThresh = 0.9f, + float textReadabilityThresh = 0.9f, + List blocks = null) + { + var decision = new Dictionary + { + ["should_ocr"] = false, + ["has_ocr_text"] = false, + ["has_text"] = false, + ["readable_text"] = false, + ["image_covers_page"] = false, + ["has_vector_chars"] = false, + ["transform"] = new Matrix(1, 0, 0, 1, 0, 0), // Identity matrix + ["pixmap"] = null, + }; + + Rect pageRect = page.Rect; + float pageArea = Math.Abs(pageRect.Width * pageRect.Height); + + // Analyze the page + var analysis = Utils.AnalyzePage(page, blocks); + + // Return if page is completely blank + Rect covered = analysis["covered"] as Rect; + if (Utils.BboxIsEmpty(covered)) + { + decision["should_ocr"] = false; + return decision; + } + + // Return if page has been OCR'd already + int ocrSpans = (int)analysis["ocr_spans"]; + if (ocrSpans > 0) + { + decision["has_ocr_text"] = true; + decision["should_ocr"] = false; + return decision; + } + + float txtArea = (float)analysis["txt_area"]; + int charsTotal = (int)analysis["chars_total"]; + float txtJoins = (float)analysis["txt_joins"]; + float vecArea = (float)analysis["vec_area"]; + float imgArea = (float)analysis["img_area"]; + int charsBad = (int)analysis["chars_bad"]; + + // Preset OCR if very little text area exists + // Less than 5% text area in covered area + if (txtArea < 0.05f && charsTotal < 200 && txtJoins < 0.3f) + { + if (vecArea >= vectorThresh) + { + decision["should_ocr"] = true; + decision["has_vector_chars"] = true; + } + if (imgArea >= imageCoverageThresh) + { + decision["should_ocr"] = true; + decision["image_covers_page"] = true; + } + } + 
else if (charsTotal >= 200) + { + decision["has_text"] = true; + float readability = 1.0f - (float)charsBad / charsTotal; + if (readability >= textReadabilityThresh) + { + decision["readable_text"] = true; + decision["should_ocr"] = false; + } + else + { + decision["readable_text"] = false; + decision["should_ocr"] = true; + } + } + + if (!(bool)decision["should_ocr"]) + return decision; + + if (!(bool)decision["readable_text"] && (bool)decision["has_text"]) + return decision; + + // We need OCR and do a final check for potential text presence + if (!(bool)decision["has_text"]) + { + // Rasterize and check for photo versus text-heaviness + var (matrix, pix, photo) = GetPageImage(page, dpi, covered); + + if (photo) + { + // This seems to be a non-text picture page + decision["should_ocr"] = false; + decision["pixmap"] = null; + } + else + { + decision["should_ocr"] = true; + decision["transform"] = matrix; + decision["pixmap"] = pix; + } + } + + return decision; + } + } +} diff --git a/MuPDF.NET4LLM/helpers/DocumentLayout.cs b/MuPDF.NET4LLM/helpers/DocumentLayout.cs new file mode 100644 index 0000000..3391fa9 --- /dev/null +++ b/MuPDF.NET4LLM/helpers/DocumentLayout.cs @@ -0,0 +1,807 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using MuPDF.NET; +using Newtonsoft.Json; + +namespace MuPDF.NET4LLM.Helpers +{ + /// + /// Layout box representing a content region on a page + /// + public class LayoutBox + { + public float X0 { get; set; } + public float Y0 { get; set; } + public float X1 { get; set; } + public float Y1 { get; set; } + public string BoxClass { get; set; } // e.g. 'text', 'picture', 'table', etc. + // If boxclass == 'picture' or 'formula', store image bytes + public byte[] Image { get; set; } + // If boxclass == 'table' + public Dictionary Table { get; set; } + // Text line information for text-type boxclasses + public List TextLines { get; set; } + } + + /// + /// Text line information + /// + public class TextLineInfo + { + public Rect Bbox { get; set; } + public List Spans { get; set; } + } + + /// + /// Page layout information + /// + public class PageLayout + { + public int PageNumber { get; set; } + public float Width { get; set; } + public float Height { get; set; } + public List Boxes { get; set; } + public bool FullOcred { get; set; } // Whether the page is an OCR page + public bool TextOcred { get; set; } // Whether the page text only is OCR'd + public List FullText { get; set; } // Full page text in extractDICT format + public List Words { get; set; } // List of words with bbox (not yet activated) + public List Links { get; set; } + } + + /// + /// Parsed document structure and layout serialization helpers. + /// Ported and adapted from the Python module helpers/document_layout.py in pymupdf4llm. + /// + public class ParsedDocument + { + public string Filename { get; set; } // Source file name + public int PageCount { get; set; } + public List Toc { get; set; } // e.g. 
[{'title': 'Intro', 'page': 1}] + public List Pages { get; set; } + public Dictionary Metadata { get; set; } + public Dictionary> FormFields { get; set; } + public bool FromBytes { get; set; } // Whether loaded from bytes + public int ImageDpi { get; set; } = 150; // Image resolution + public string ImageFormat { get; set; } = "png"; // 'png' or 'jpg' + public string ImagePath { get; set; } = ""; // Path to save images + public bool UseOcr { get; set; } = true; // If beneficial invoke OCR + public bool ForceText { get; set; } + public bool EmbedImages { get; set; } + public bool WriteImages { get; set; } + + /// + /// Serialize the parsed document into Markdown text, closely following + /// ParsedDocument.to_markdown in the original Python implementation. + /// + public string ToMarkdown( + bool header = true, + bool footer = true, + bool writeImages = false, + bool embedImages = false, + bool ignoreCode = false, + bool showProgress = false, + bool pageSeparators = false, + bool pageChunks = false) + { + if (pageChunks) + { + throw new NotImplementedException("Page chunks mode not yet fully implemented"); + } + + var documentOutput = new StringBuilder(); + + foreach (var page in Pages) + { + var mdString = new StringBuilder(); + // Make a mapping: box number -> list item hierarchy level + var listItemLevels = CreateListItemLevels(page.Boxes); + + foreach (var (box, i) in page.Boxes.Select((b, idx) => (b, idx))) + { + var clip = new Rect(box.X0, box.Y0, box.X1, box.Y1); + string btype = box.BoxClass; + + if (btype == "page-header" && !header) + continue; + if (btype == "page-footer" && !footer) + continue; + + if (btype == "picture" || btype == "formula" || btype == "table-fallback") + { + if (box.Image != null) + { + if (embedImages) + { + // Make a base64 encoded string of the image + string base64 = Convert.ToBase64String(box.Image); + string data = $"data:image/{ImageFormat};base64,{base64}"; + mdString.Append($"\n![]({data})\n\n"); + } + else if (writeImages) + { + // Save image and reference it + mdString.Append($"\n![Image]({ImagePath})\n\n"); + } + } + else + { + mdString.Append($"**==> picture [{clip.Width} x {clip.Height}] intentionally omitted <==**\n\n"); + } + + // Output text in image if requested + if (box.TextLines != null && box.TextLines.Count > 0) + { + mdString.Append(TextToMd(box.TextLines, ignoreCode || page.FullOcred)); + } + } + else if (btype == "table" && box.Table != null) + { + if (box.Table.ContainsKey("markdown")) + { + string tableText = box.Table["markdown"].ToString(); + if (page.FullOcred) + // Remove code style if page was OCR'd + tableText = tableText.Replace("`", ""); + mdString.Append(tableText + "\n\n"); + } + } + else if (btype == "list-item") + { + int level = listItemLevels.ContainsKey(i) ? listItemLevels[i] : 1; + mdString.Append(ListItemToMd(box.TextLines, level)); + } + else if (btype == "footnote") + { + mdString.Append(FootnoteToMd(box.TextLines)); + } + else if (box.TextLines != null) + { + // Treat as normal MD text + mdString.Append(TextToMd(box.TextLines, ignoreCode || page.FullOcred)); + } + } + + if (pageSeparators) + { + mdString.Append($"--- end of page={page.PageNumber} ---\n\n"); + } + + documentOutput.Append(mdString.ToString()); + } + + return documentOutput.ToString(); + } + + /// + /// Serialize the parsed document into a JSON string, mirroring the behavior + /// of the Python ParsedDocument.to_json helper. 
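+        /// Example (illustrative sketch; parsed is assumed to be a ParsedDocument returned by
+        /// DocumentLayout.ParseDocument, and the output path is a placeholder):
+        ///     string json = parsed.ToJson();
+        ///     System.IO.File.WriteAllText("layout.json", json);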
+ /// + public string ToJson() + { + // Serialize to JSON + var settings = new JsonSerializerSettings + { + Formatting = Formatting.Indented, + NullValueHandling = NullValueHandling.Ignore, + Converters = new List + { + new LayoutJsonConverter() + } + }; + + return JsonConvert.SerializeObject(this, settings); + } + + /// + /// Serialize the parsed document to plain text. + /// This follows the logic of ParsedDocument.to_text in the Python code, + /// including optional suppression of headers / footers and simple table rendering. + /// + public string ToText( + bool header = true, + bool footer = true, + bool ignoreCode = false, + bool showProgress = false, + bool pageChunks = false, + string tableFormat = "grid") + { + if (pageChunks) + { + throw new NotImplementedException("Page chunks mode not yet fully implemented"); + } + + var documentOutput = new StringBuilder(); + + foreach (var page in Pages) + { + var textString = new StringBuilder(); + var listItemLevels = CreateListItemLevels(page.Boxes); + + foreach (var (box, i) in page.Boxes.Select((b, idx) => (b, idx))) + { + var clip = new Rect(box.X0, box.Y0, box.X1, box.Y1); + string btype = box.BoxClass; + + if (btype == "page-header" && !header) + continue; + if (btype == "page-footer" && !footer) + continue; + + if (btype == "picture" || btype == "formula" || btype == "table-fallback") + { + textString.Append($"==> picture [{clip.Width} x {clip.Height}] <==\n\n"); + if (box.TextLines != null && box.TextLines.Count > 0) + { + textString.Append(TextToText(box.TextLines, ignoreCode || page.FullOcred)); + } + } + else if (btype == "table" && box.Table != null) + { + // Note: Table formatting would need tabulate equivalent + textString.Append("[Table]\n\n"); + } + else if (btype == "list-item") + { + int level = listItemLevels.ContainsKey(i) ? listItemLevels[i] : 1; + textString.Append(ListItemToText(box.TextLines, level)); + } + else if (btype == "footnote") + { + textString.Append(FootnoteToText(box.TextLines)); + } + else if (box.TextLines != null) + { + // Handle other cases as normal text + textString.Append(TextToText(box.TextLines, ignoreCode || page.FullOcred)); + } + } + + documentOutput.Append(textString.ToString()); + } + + return documentOutput.ToString(); + } + + // Helper methods for text conversion + private static string TitleToMd(List textLines) + { + var sb = new StringBuilder(); + foreach (var line in textLines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + sb.Append(span.Text); + } + } + sb.Append("\n"); + } + return sb.ToString(); + } + + private static string SectionHdrToMd(List textLines) + { + var sb = new StringBuilder(); + foreach (var line in textLines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + sb.Append(span.Text); + } + } + sb.Append("\n"); + } + return sb.ToString(); + } + + /// + /// Convert list-item layout boxes to markdown. + /// The first line is prefixed with -. Subsequent lines are appended + /// without a line break if their rectangle does not start to the left of + /// the previous line; otherwise a new markdown list item is started. + /// 2 units of tolerance is used to avoid spurious line breaks. + /// + /// The text line information for the list item. + /// The hierarchy level (1 for top-level). 
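+        /// Example: a level-2 item whose spans read "Second point" is emitted as
+        ///     "  - Second point"
+        /// i.e. two leading spaces per level above 1, following the indentation rule described above.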
+ private static string ListItemToMd(List textLines, int level) + { + var sb = new StringBuilder(); + string indent = new string(' ', (level - 1) * 2); // Indentation based on level + sb.Append(indent + "- "); + foreach (var line in textLines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + sb.Append(span.Text); + } + } + } + sb.Append("\n"); + return sb.ToString(); + } + + /// + /// Convert footnote layout boxes to markdown. + /// The first line is prefixed with > ; subsequent lines start a + /// new blockquote when they begin with superscripted text. + /// We render footnotes as blockquotes. + /// + /// The text line information for the footnote. + private static string FootnoteToMd(List textLines) + { + var sb = new StringBuilder(); + // We render footnotes as blockquotes + sb.Append("[^"); + foreach (var line in textLines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + sb.Append(span.Text); + } + } + } + sb.Append("]\n"); + return sb.ToString(); + } + + /// + /// Convert generic text layout boxes to markdown, as well as box classes + /// not specifically handled elsewhere. Lines are concatenated without + /// line breaks; at the end, two newlines are used to separate from the + /// next block. Monospaced spans may be emitted as code when + /// is false. + /// + /// The text line information to convert. + /// If true, do not emit code-style formatting. + private static string TextToMd(List textLines, bool ignoreCode) + { + // Handle completely monospaced textlines as code block + // Check for superscript - handle mis-classified text boundary box + if (textLines == null || textLines.Count == 0) + return ""; + + var sb = new StringBuilder(); + foreach (var line in textLines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + string text = span.Text ?? ""; + if (!ignoreCode && span.Font != null && span.Font.Contains("Mono")) + { + text = "`" + text + "`"; + } + sb.Append(text); + } + } + sb.Append("\n"); + } + return sb.ToString(); + } + + /// + /// Convert list-item layout boxes to plain text. + /// The first line is prefixed with a dash and indentation according to + /// the hierarchy level; subsequent lines are concatenated. + /// + /// The text line information for the list item. + /// The hierarchy level (1 for top-level). + private static string ListItemToText(List textLines, int level) + { + if (textLines == null || textLines.Count == 0) + return ""; + + var sb = new StringBuilder(); + string indent = new string(' ', (level - 1) * 2); // Indentation based on level + sb.Append(indent + "- "); + foreach (var line in textLines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + sb.Append(span.Text); + } + } + } + sb.Append("\n"); + return sb.ToString(); + } + + /// + /// Convert footnote layout boxes to plain text, concatenating + /// all spans into a single textual representation. + /// We render footnotes as blockquotes. + /// + /// The text line information for the footnote. + private static string FootnoteToText(List textLines) + { + if (textLines == null || textLines.Count == 0) + return ""; + + var sb = new StringBuilder(); + // We render footnotes as blockquotes + foreach (var line in textLines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + sb.Append(span.Text); + } + } + } + sb.Append("\n"); + return sb.ToString(); + } + + /// + /// Convert generic text layout boxes to plain text. 
The text of all + /// spans of all lines is written without line breaks. + /// At the end, two newlines are added to separate from the next block. + /// + /// The text line information to convert. + /// Currently unused; included for parity with markdown conversion. + private static string TextToText(List textLines, bool ignoreCode) + { + if (textLines == null || textLines.Count == 0) + return ""; + + var sb = new StringBuilder(); + foreach (var line in textLines) + { + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + sb.Append(span.Text); + } + } + sb.Append("\n"); + } + return sb.ToString(); + } + + /// + /// Map the layout box index of each list item to its hierarchy level. + /// + /// This post-layout heuristic walks contiguous segments of list-item + /// boxes and assigns increasing levels when the left coordinate moves + /// sufficiently to the right, mirroring + /// create_list_item_levels in the Python implementation. + /// + /// The list of layout boxes for the page. + /// + /// A dictionary mapping box index to level, where level is 1 for + /// top-level items. + /// + private static Dictionary CreateListItemLevels(List boxes) + { + var itemDict = new Dictionary(); // Dictionary of item index -> level + var segments = new List>(); // List of item segments + var currentSegment = new List<(int idx, LayoutBox box)>(); // Current segment + + // Create segments of contiguous list items. Each non-list-item finishes + // the current segment. Also, two list-items in a row belonging to different + // page text columns end the segment after the first item. + for (int i = 0; i < boxes.Count; i++) + { + var box = boxes[i]; + if (box.BoxClass != "list-item") // Bbox class is no list-item + { + if (currentSegment.Count > 0) // End and save the current segment + { + segments.Add(currentSegment); + currentSegment = new List<(int idx, LayoutBox box)>(); + } + continue; + } + + if (currentSegment.Count > 0) // Check if we need to end the current segment + { + var (prevIdx, prevBox) = currentSegment[currentSegment.Count - 1]; + if (box.X0 > prevBox.X1 || box.Y1 < prevBox.Y0) + { + // End and save the current segment + segments.Add(currentSegment); + currentSegment = new List<(int idx, LayoutBox box)>(); + } + } + currentSegment.Add((i, box)); // Append item to segment + } + if (currentSegment.Count > 0) + segments.Add(currentSegment); // Append last segment + + // Walk through segments and assign levels + foreach (var segment in segments) + { + if (segment.Count == 0) continue; // Skip empty segments + var sorted = segment.OrderBy(x => x.box.X0).ToList(); // Sort by x0 coordinate of the bbox + + // List of leveled items in the segment: (idx, bbox, level) + // First item has level 1 + var leveled = new List<(int idx, LayoutBox box, int level)> + { + (sorted[0].idx, sorted[0].box, 1) + }; + + for (int i = 1; i < sorted.Count; i++) + { + var (prevIdx, prevBox, prevLvl) = leveled[leveled.Count - 1]; + var (currIdx, currBox) = sorted[i]; + // X0 coordinate increased by more than 10 points: increase level + int currLvl = currBox.X0 > prevBox.X0 + 10 ? prevLvl + 1 : prevLvl; + leveled.Add((currIdx, currBox, currLvl)); + } + + foreach (var (idx, box, lvl) in leveled) + { + itemDict[idx] = lvl; + } + } + + return itemDict; + } + } + + /// + /// Document layout parsing utilities. + /// Provides a C# equivalent of pymupdf4llm.helpers.document_layout.parse_document. 
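+    /// Example (illustrative sketch; "report.pdf" is a placeholder path):
+    ///     Document doc = new Document("report.pdf");
+    ///     ParsedDocument parsed = DocumentLayout.ParseDocument(doc);
+    ///     Console.WriteLine(parsed.ToMarkdown(pageSeparators: true));
+    ///     doc.Close();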
+ /// + public static class DocumentLayout + { + /// + /// Parse document structure + /// + public static ParsedDocument ParseDocument( + Document doc, + string filename = "", + int imageDpi = 150, + string imageFormat = "png", + string imagePath = "", + int ocrDpi = 400, + List pages = null, + bool writeImages = false, + bool embedImages = false, + bool showProgress = false, + bool forceText = true, + bool useOcr = true, + string ocrLanguage = "eng") + { + // Note: Remove StructTreeRoot to avoid possible performance degradation. + // We will not use the structure tree anyway. + if (embedImages && writeImages) + throw new ArgumentException("Cannot both embed and write images."); + + var document = new ParsedDocument + { + Filename = !string.IsNullOrEmpty(filename) ? filename : doc.Name, + PageCount = doc.PageCount, + Toc = doc.GetToc().Cast().ToList(), + Metadata = doc.MetaData, + FormFields = Utils.ExtractFormFieldsWithPages(doc), + ImageDpi = imageDpi, + ImageFormat = imageFormat, + ImagePath = imagePath, + UseOcr = useOcr, + ForceText = forceText, + EmbedImages = embedImages, + WriteImages = writeImages, + Pages = new List() + }; + + if (pages == null) + pages = Enumerable.Range(0, doc.PageCount).ToList(); + + var progressBar = showProgress && pages.Count > 5 + ? ProgressBar.Create(pages.Cast().ToList()) + : null; + + try + { + foreach (int pno in pages) + { + if (progressBar != null && !progressBar.MoveNext()) + break; + + Page page = doc.LoadPage(pno); + try + { + TextPage textPage = page.GetTextPage( + clip: new Rect(float.NegativeInfinity, float.NegativeInfinity, + float.PositiveInfinity, float.PositiveInfinity), + flags: Utils.FLAGS); + PageInfo pageInfo = textPage.ExtractDict(null, false); + List blocks = pageInfo.Blocks; + + bool pageFullOcred = false; + bool pageTextOcred = false; + + // Check if this page should be OCR'd + if (useOcr) + { + var decision = CheckOcr.ShouldOcrPage(page, dpi: ocrDpi, blocks: blocks); + // Prevent MD styling if already OCR'd + pageFullOcred = decision.TryGetValue("has_ocr_text", out var hasOcrText) ? (bool)hasOcrText : false; + + if (decision.TryGetValue("should_ocr", out var shouldOcr) && (bool)shouldOcr) + { + // We should be OCR: check full-page vs. 
text-only + if (decision.ContainsKey("pixmap") && decision["pixmap"] != null) + { + // Full-page OCR would be implemented here + // Retrieve the Pixmap, OCR it, get the OCR'd PDF, copy text over to original page + pageFullOcred = true; + } + else + { + blocks = CheckOcr.RepairBlocks(blocks, page); + pageTextOcred = true; + } + } + } + + var pageLayout = new PageLayout + { + PageNumber = pno, + Width = page.Rect.Width, + Height = page.Rect.Height, + Boxes = new List(), + FullOcred = pageFullOcred, + TextOcred = pageTextOcred, + FullText = blocks, + Words = new List(), + Links = page.GetLinks() + }; + + // Extract text lines for each block + // Each line is represented as its bbox and a list of spans + var lines = GetTextLines.GetRawLines(textPage, blocks, page.Rect); + + foreach (var line in lines) + { + var layoutBox = new LayoutBox + { + X0 = line.Rect.X0, + Y0 = line.Rect.Y0, + X1 = line.Rect.X1, + Y1 = line.Rect.Y1, + BoxClass = "text", + TextLines = new List + { + new TextLineInfo + { + Bbox = line.Rect, + Spans = line.Spans + } + } + }; + + pageLayout.Boxes.Add(layoutBox); + } + + document.Pages.Add(pageLayout); + textPage.Dispose(); + } + finally + { + page.Dispose(); + } + } + } + finally + { + progressBar?.Dispose(); + } + + return document; + } + } + + /// + /// Custom JSON converter for Layout objects + /// + public class LayoutJsonConverter : JsonConverter + { + public override bool CanConvert(Type objectType) + { + return objectType == typeof(byte[]) || + objectType == typeof(Rect) || + objectType == typeof(Point) || + objectType == typeof(Matrix) || + objectType == typeof(IRect) || + objectType == typeof(Quad); + } + + public override object ReadJson(JsonReader reader, Type objectType, object existingValue, JsonSerializer serializer) + { + throw new NotImplementedException("Deserialization not implemented"); + } + + public override void WriteJson(JsonWriter writer, object value, JsonSerializer serializer) + { + if (value == null) + { + writer.WriteNull(); + return; + } + + if (value is byte[] bytes) + { + string base64 = Convert.ToBase64String(bytes); + writer.WriteValue(base64); + } + else if (value is Rect rect) + { + writer.WriteStartArray(); + writer.WriteValue(rect.X0); + writer.WriteValue(rect.Y0); + writer.WriteValue(rect.X1); + writer.WriteValue(rect.Y1); + writer.WriteEndArray(); + } + else if (value is Point point) + { + writer.WriteStartArray(); + writer.WriteValue(point.X); + writer.WriteValue(point.Y); + writer.WriteEndArray(); + } + else if (value is Matrix matrix) + { + writer.WriteStartArray(); + writer.WriteValue(matrix.A); + writer.WriteValue(matrix.B); + writer.WriteValue(matrix.C); + writer.WriteValue(matrix.D); + writer.WriteValue(matrix.E); + writer.WriteValue(matrix.F); + writer.WriteEndArray(); + } + else if (value is IRect irect) + { + writer.WriteStartArray(); + writer.WriteValue(irect.X0); + writer.WriteValue(irect.Y0); + writer.WriteValue(irect.X1); + writer.WriteValue(irect.Y1); + writer.WriteEndArray(); + } + else if (value is Quad quad) + { + writer.WriteStartArray(); + writer.WriteStartArray(); + writer.WriteValue(quad.UpperLeft.X); + writer.WriteValue(quad.UpperLeft.Y); + writer.WriteEndArray(); + writer.WriteStartArray(); + writer.WriteValue(quad.UpperRight.X); + writer.WriteValue(quad.UpperRight.Y); + writer.WriteEndArray(); + writer.WriteStartArray(); + writer.WriteValue(quad.LowerLeft.X); + writer.WriteValue(quad.LowerLeft.Y); + writer.WriteEndArray(); + writer.WriteStartArray(); + writer.WriteValue(quad.LowerRight.X); + 
writer.WriteValue(quad.LowerRight.Y); + writer.WriteEndArray(); + writer.WriteEndArray(); + } + else + { + writer.WriteNull(); + } + } + } +} diff --git a/MuPDF.NET4LLM/helpers/ExtendedSpan.cs b/MuPDF.NET4LLM/helpers/ExtendedSpan.cs new file mode 100644 index 0000000..5c9e4c8 --- /dev/null +++ b/MuPDF.NET4LLM/helpers/ExtendedSpan.cs @@ -0,0 +1,24 @@ +using System.Collections.Generic; +using MuPDF.NET; + +namespace MuPDF.NET4LLM.Helpers +{ + /// + /// Extended span information for text line extraction. + /// Mirrors the span dictionaries produced by pymupdf4llm in the Python helpers. + /// + public class ExtendedSpan + { + public string Text { get; set; } + public Rect Bbox { get; set; } + public float Size { get; set; } + public string Font { get; set; } + public int Flags { get; set; } + public int CharFlags { get; set; } + public float Alpha { get; set; } + public int Line { get; set; } + public int Block { get; set; } + public Point Dir { get; set; } + public List Chars { get; set; } + } +} diff --git a/MuPDF.NET4LLM/helpers/GetTextLines.cs b/MuPDF.NET4LLM/helpers/GetTextLines.cs new file mode 100644 index 0000000..58e23c6 --- /dev/null +++ b/MuPDF.NET4LLM/helpers/GetTextLines.cs @@ -0,0 +1,442 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using MuPDF.NET; + +namespace MuPDF.NET4LLM.Helpers +{ + /// + /// Represents a line with its rectangle and spans + /// + public class TextLine + { + public Rect Rect { get; set; } + public List Spans { get; set; } + } + + /// + /// Text line extraction utilities. + /// Ported and adapted from the Python module helpers/get_text_lines.py in pymupdf4llm. + /// + public static class GetTextLines + { + /// + /// Extract the text spans from a in natural reading order. + /// All spans whose vertical positions are within of + /// each other are merged into a single logical line, mirroring the behavior of + /// get_raw_lines() in helpers/get_text_lines.py. + /// + /// + /// This is a C# port of pymupdf4llm.helpers.get_text_lines.get_raw_lines. + /// It compensates for MuPDF’s tendency to create multiple short lines when spans + /// are separated by small gaps, by joining adjacent spans into longer lines. + /// + /// The result is a list of objects, each containing a + /// joined line rectangle and a left‑to‑right sorted list of + /// items. Each span is annotated with its original block / line index so that + /// callers can still detect original MuPDF line breaks if needed. + /// + /// + /// Source . May be null if + /// are provided directly. + /// + /// + /// Optional list of objects to reuse an existing + /// ExtractDict result instead of re‑extracting from . + /// Only text blocks (Type == 0) with non‑empty bounding boxes are considered. + /// + /// +/// Optional clipping rectangle. Only spans whose bounding boxes overlap this +/// area (within ) are taken into account. + /// + /// + /// Maximum vertical distance (in points) between span baselines or tops for + /// them to be considered part of the same logical line (default: 3). + /// + /// + /// When true, spans with zero alpha (invisible text) are skipped, except + /// for Type 3 fonts (which are always kept), matching the Python logic. + /// + /// + /// When true, only spans with approximately horizontal direction + /// vectors are included (i.e. abs(1 - dir.x) <= 1e‑3), ignoring + /// vertical or rotated text. + /// + /// + /// A list of objects. If no spans are found, an + /// empty list is returned. 
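+        /// Example (illustrative sketch; page is assumed to be a loaded Page, and System.Linq is in scope):
+        ///     TextPage tp = page.GetTextPage(clip: page.Rect, flags: (int)TextFlags.TEXT_MEDIABOX_CLIP);
+        ///     foreach (TextLine line in GetTextLines.GetRawLines(tp, clip: page.Rect))
+        ///     {
+        ///         string text = string.Join("", line.Spans.Select(s => s.Text));
+        ///         Console.WriteLine($"{line.Rect}: {text}");
+        ///     }
+        ///     tp.Dispose();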
+ /// + public static List GetRawLines( + TextPage textpage = null, + List blocks = null, + Rect clip = null, + float tolerance = 3.0f, + bool ignoreInvisible = true, + bool onlyHorizontal = true) + { + float yDelta = tolerance; // Allowable vertical coordinate deviation + + if (textpage == null && blocks == null) + throw new ArgumentException("Either textpage or blocks must be provided."); + + if (clip == null && textpage != null) + { + // Use TextPage rect if not provided + clip = new Rect(float.NegativeInfinity, float.NegativeInfinity, + float.PositiveInfinity, float.PositiveInfinity); + } + + // Extract text blocks - if bbox is not empty + if (blocks == null && textpage != null) + { + PageInfo pageInfo = textpage.ExtractDict(null, false); + blocks = pageInfo.Blocks?.Where(b => b.Type == 0 && !Utils.BboxIsEmpty(b.Bbox)).ToList(); + } + + if (blocks == null) + blocks = new List(); + + List spans = new List(); // All spans in TextPage here + + for (int bno = 0; bno < blocks.Count; bno++) // The numbered blocks + { + Block b = blocks[bno]; + if (Utils.OutsideBbox(b.Bbox, clip)) + continue; + + if (b.Lines == null) + continue; + + for (int lno = 0; lno < b.Lines.Count; lno++) // The numbered lines + { + Line line = b.Lines[lno]; + if (Utils.OutsideBbox(line.Bbox, clip)) + continue; + + Point lineDir = line.Dir; + if (onlyHorizontal && Math.Abs(1 - lineDir.X) > 1e-3) // Only accept horizontal text + continue; + + if (line.Spans == null) + continue; + + for (int sno = 0; sno < line.Spans.Count; sno++) // The numbered spans + { + Span s = line.Spans[sno]; + string text = s.Text ?? ""; + + if (Utils.IsWhite(text)) + // Ignore white text if not a Type3 font + continue; + + // Ignore invisible text. Type 3 font text is never invisible. + // Note: Alpha and CharFlags may need different access in MuPDF.NET + if (s.Font != Utils.TYPE3_FONT_NAME && ignoreInvisible) + { + // Skip invisible text if needed - would need Alpha property + // For now, continue + } + + if (!Utils.AlmostInBbox(s.Bbox, clip)) // If not in clip + continue; + + Rect sbbox = new Rect(s.Bbox); // Span bbox as a Rect + if (((int)s.Flags & 1) != 0) // If a superscript, modify bbox + { + // With that of the preceding or following span + int i = sno == 0 ? 1 : sno - 1; + if (line.Spans.Count > i) + { + Span neighbor = line.Spans[i]; + sbbox.Y1 = neighbor.Bbox.Y1; + } + text = $"[{text}]"; + } + + sbbox = sbbox; // Update with the Rect version + // Include line/block numbers to facilitate separator insertion + ExtendedSpan extSpan = new ExtendedSpan + { + Text = text, + Bbox = sbbox, + Size = s.Size, + Font = s.Font, + Flags = (int)s.Flags, + CharFlags = 0, // Would need to extract from Span if available + Alpha = 1.0f, // Would need to extract from Span if available + Line = lno, + Block = bno, + Dir = lineDir, + Chars = s.Chars + }; + + spans.Add(extSpan); + } + } + } + + if (spans.Count == 0) // No text at all + return new List(); + + // Sort spans by bottom coord + spans = spans.OrderBy(s => -s.Dir.X).ThenBy(s => s.Bbox.Y1).ToList(); + + List nlines = new List(); // Final result + List currentLine = new List { spans[0] }; // Collects spans with fitting vertical coordinates + Rect lrect = new Rect(spans[0].Bbox); // Rectangle joined from span rectangles + + for (int i = 1; i < spans.Count; i++) // Walk through the spans + { + ExtendedSpan s = spans[i]; + Rect sbbox = s.Bbox; // This bbox + Rect sbbox0 = currentLine[currentLine.Count - 1].Bbox; // Previous bbox + // If any of top or bottom coordinates are close enough, join... 
+ if (Math.Abs(sbbox.Y1 - sbbox0.Y1) <= yDelta || + Math.Abs(sbbox.Y0 - sbbox0.Y0) <= yDelta) + { + currentLine.Add(s); // Append to this line + lrect = Utils.JoinRects(new List { lrect, sbbox }); // Extend line rectangle + continue; + } + + // End of current line, sort its spans from left to right + currentLine = SanitizeSpans(currentLine); + + // Append line rect and its spans to final output + nlines.Add(new TextLine { Rect = lrect, Spans = currentLine }); + + currentLine = new List { s }; // Start next line + lrect = new Rect(sbbox); // Initialize its rectangle + } + + // Need to append last line in the same way + currentLine = SanitizeSpans(currentLine); + nlines.Add(new TextLine { Rect = lrect, Spans = currentLine }); + + return nlines; + } + + /// + /// Sort and join spans within a single logical line. + /// + /// + /// This corresponds to the inner sanitize_spans() helper in + /// get_text_lines.py. Spans are first sorted left‑to‑right and then + /// adjacent spans with nearly touching x‑coordinates and identical style + /// (font flags and character flags, except superscript) are merged into a + /// single by concatenating their text and + /// joining their bounding boxes. + /// + private static List SanitizeSpans(List line) + { + if (line.Count == 0) + return line; + + // Sort ascending horizontally + line = line.OrderBy(s => s.Bbox.X0).ToList(); + // Join spans, delete duplicates + // Underline differences are being ignored + for (int i = line.Count - 1; i > 0; i--) // Iterate back to front + { + ExtendedSpan s0 = line[i - 1]; // Preceding span + ExtendedSpan s1 = line[i]; // This span + // "Delta" depends on the font size. Spans will be joined if + // no more than 10% of the font size separates them and important + // attributes are the same. + float delta = s1.Size * 0.1f; + if (s0.Bbox.X1 + delta < s1.Bbox.X0 || + s0.Flags != s1.Flags || + (s0.CharFlags & ~2) != (s1.CharFlags & ~2)) + { + continue; // No joining + } + // We need to join bbox and text of two consecutive spans + // Sometimes, spans may also be duplicated. + if (s0.Text != s1.Text || !s0.Bbox.EqualTo(s1.Bbox)) + { + s0.Text += s1.Text; + } + s0.Bbox = Utils.JoinRects(new List { s0.Bbox, s1.Bbox }); // Join boundary boxes + line.RemoveAt(i); // Delete the joined-in span + line[i - 1] = s0; // Update the span + } + + return line; + } + + /// + /// Extract plain text line‑by‑line in natural reading order. + /// + /// + /// This is the C# equivalent of get_text_lines() in + /// helpers/get_text_lines.py. It first obtains logical lines via + /// , + /// then concatenates spans on the same original MuPDF line, inserting the + /// separator when a new original line continues the + /// same logical line. + /// + /// For non‑OCR text ( = false), this produces + /// continuous text suitable for indexing while preserving a reasonable + /// reading order, including extra blank lines between text blocks. + /// + /// When is true, a simplified table recognition + /// is applied to the OCR output: lines are grouped into columns based on + /// x‑coordinates and emitted as a Markdown table, analogous to the Python + /// implementation. + /// + /// The source to extract text from. + /// + /// Optional pre‑created . When null, this method +/// will create a temporary text page (or OCR text page if +/// is true) and dispose it afterwards. + /// + /// + /// Optional clipping rectangle restricting the area from which lines are read. 
+ /// + /// + /// Separator string used when joining multiple MuPDF lines that are merged + /// into a single logical line (default: tab, matching the Python version). + /// + /// + /// Vertical tolerance passed through to . + /// + /// + /// When true, uses OCR text extraction and applies rudimentary + /// table reconstruction, returning a Markdown‑style table for tabular OCR output. + /// + /// + /// A string containing the page text in reading order. For non‑OCR mode, + /// this is plain text with line breaks and block separators. For OCR mode, + /// it may contain Markdown‑style tables. + /// + public static string GetTextLinesFormatted( + Page page, + TextPage textpage = null, + Rect clip = null, + string sep = "\t", + float tolerance = 3.0f, + bool ocr = false) + { + int textFlags = (int)TextFlags.TEXT_MEDIABOX_CLIP; + page.SetRotation(0); + Rect prect = clip ?? page.Rect; // Area to consider + + string xsep = sep == "|" ? "" : sep; + + // Make a TextPage if required + TextPage tp = textpage; + bool disposeTp = false; + + if (tp == null) + { + if (!ocr) + { + tp = page.GetTextPage(clip: prect, flags: textFlags); + } + else + { + tp = page.GetTextPageOcr(dpi: 300, full: true); + } + disposeTp = true; + } + + List lines = GetRawLines(tp, null, prect, tolerance); + + if (disposeTp) // Delete temp TextPage + { + tp?.Dispose(); + } + + if (lines == null || lines.Count == 0) + return ""; + + string alltext = ""; + + // Compose final text + if (!ocr) + { + int prevBno = -1; // Number of previous text block + foreach (var (lrect, line) in lines.Select(l => (l.Rect, l.Spans))) // Iterate through lines + { + // Insert extra line break if a different block + int bno = line[0].Block; // Block number of this line + if (bno != prevBno) + { + alltext += "\n"; + } + prevBno = bno; + + int lineNo = line[0].Line; // Store the line number of previous span + foreach (var s in line) // Walk over the spans in the line + { + int lno = s.Line; + string stext = s.Text; + if (lineNo == lno) + { + alltext += stext; + } + else + { + alltext += sep + stext; + } + lineNo = lno; + } + alltext += "\n"; // Append line break after a line + } + alltext += "\n"; // Append line break at end of block + return alltext; + } + + // For OCR output, we try a rudimentary table recognition. + List> rows = new List>(); + List xvalues = new List(); + int colCount = 0; + + foreach (var (lrect, line) in lines.Select(l => (l.Rect, l.Spans))) + { + // If only 1 span in line and no columns identified yet... 
+ if (line.Count == 1 && xvalues.Count == 0) + { + alltext += line[0].Text + "\n\n\n"; + continue; + } + // Multiple spans in line and no columns identified yet + else if (xvalues.Count == 0) // Define column borders + { + xvalues = line.Select(s => s.Bbox.X0).ToList(); + xvalues.Add(line[line.Count - 1].Bbox.X1); + colCount = line.Count; // Number of columns + } + + List row = new List(new string[colCount]); + foreach (var s in line) + { + for (int i = 0; i < xvalues.Count - 1; i++) + { + float x0 = xvalues[i]; + float x1 = xvalues[i + 1]; + if (Math.Abs(s.Bbox.X0 - x0) <= 3 || Math.Abs(s.Bbox.X1 - x1) <= 3) + { + row[i] = s.Text; + } + } + } + rows.Add(row); + } + + if (rows.Count > 0 && rows[0].Count > 0) + { + string header = "|" + string.Join("|", rows[0]) + "|\n"; + alltext += header; + alltext += "|" + string.Join("|", Enumerable.Range(0, rows[0].Count).Select(_ => "---")) + "|\n"; + for (int i = 1; i < rows.Count; i++) + { + alltext += "|" + string.Join("|", rows[i]) + "|\n"; + } + alltext += "\n"; + } + + return alltext; + } + } +} diff --git a/MuPDF.NET4LLM/helpers/ImageQuality.cs b/MuPDF.NET4LLM/helpers/ImageQuality.cs new file mode 100644 index 0000000..af6d64d --- /dev/null +++ b/MuPDF.NET4LLM/helpers/ImageQuality.cs @@ -0,0 +1,694 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Numerics; + +namespace MuPDF.NET4LLM.Helpers +{ + /// + /// Image quality analysis utilities. + /// Ported and adapted from the Python module helpers/image_quality.py in pymupdf4llm. + /// + public static class ImageQuality + { + /// + /// Bilinear resize (similar to OpenCV INTER_LINEAR), vectorized implementation in Python. + /// + /// Input image (2D byte array). + /// New height. + /// New width. + /// Resized image. + public static byte[,] ResizeBilinear(byte[,] img, int newH, int newW) + { + int h = img.GetLength(0); + int w = img.GetLength(1); + float[,] imgFloat = new float[h, w]; + for (int y = 0; y < h; y++) + for (int x = 0; x < w; x++) + imgFloat[y, x] = img[y, x]; + + // Target coordinates + float[] ys = new float[newH]; + float[] xs = new float[newW]; + for (int i = 0; i < newH; i++) + ys[i] = (i + 0.5f) * (h / (float)newH) - 0.5f; + for (int i = 0; i < newW; i++) + xs[i] = (i + 0.5f) * (w / (float)newW) - 0.5f; + + for (int i = 0; i < newH; i++) + ys[i] = Math.Max(0, Math.Min(h - 1, ys[i])); + for (int i = 0; i < newW; i++) + xs[i] = Math.Max(0, Math.Min(w - 1, xs[i])); + + int[] y0 = new int[newH]; + int[] x0 = new int[newW]; + for (int i = 0; i < newH; i++) + y0[i] = (int)Math.Floor(ys[i]); + for (int i = 0; i < newW; i++) + x0[i] = (int)Math.Floor(xs[i]); + + int[] y1 = new int[newH]; + int[] x1 = new int[newW]; + for (int i = 0; i < newH; i++) + y1[i] = Math.Min(h - 1, y0[i] + 1); + for (int i = 0; i < newW; i++) + x1[i] = Math.Min(w - 1, x0[i] + 1); + + byte[,] outImg = new byte[newH, newW]; + for (int y = 0; y < newH; y++) + { + float wy = ys[y] - y0[y]; + for (int x = 0; x < newW; x++) + { + float wx = xs[x] - x0[x]; + // Four corner values via fancy indexing + float Ia = imgFloat[y0[y], x0[x]]; // Top-left + float Ib = imgFloat[y0[y], x1[x]]; // Top-right + float Ic = imgFloat[y1[y], x0[x]]; // Bottom-left + float Id = imgFloat[y1[y], x1[x]]; // Bottom-right + + float top = Ia * (1 - wx) + Ib * wx; + float bottom = Ic * (1 - wx) + Id * wx; + float val = top * (1 - wy) + bottom * wy; + outImg[y, x] = (byte)Math.Max(0, Math.Min(255, val)); + } + } + return outImg; + } + + /// + /// 2D convolution (Cross-Correlation) with reflect padding. 
+ /// Vectorized over kernel in Python. + /// + /// Input image. + /// Convolution kernel. + /// Convolved image. + public static float[,] Convolve2D(float[,] img, float[,] kernel) + { + int kh = kernel.GetLength(0); + int kw = kernel.GetLength(1); + int padH = kh / 2; + int padW = kw / 2; + + int H = img.GetLength(0); + int W = img.GetLength(1); + float[,] padded = new float[H + 2 * padH, W + 2 * padW]; + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + padded[y + padH, x + padW] = img[y, x]; + + // Reflect padding + for (int y = 0; y < padH; y++) + for (int x = 0; x < W; x++) + padded[y, x + padW] = img[padH - y, x]; + for (int y = H + padH; y < H + 2 * padH; y++) + for (int x = 0; x < W; x++) + padded[y, x + padW] = img[2 * H + padH - y - 1, x]; + for (int y = 0; y < H + 2 * padH; y++) + for (int x = 0; x < padW; x++) + padded[y, x] = padded[y, 2 * padW - x]; + for (int y = 0; y < H + 2 * padH; y++) + for (int x = W + padW; x < W + 2 * padW; x++) + padded[y, x] = padded[y, 2 * (W + padW) - x - 2]; + + float[,] output = new float[H, W]; + // Loop only over kernel offsets, not over pixels + for (int i = 0; i < kh; i++) + { + for (int j = 0; j < kw; j++) + { + for (int y = 0; y < H; y++) + { + for (int x = 0; x < W; x++) + { + output[y, x] += kernel[i, j] * padded[y + i, x + j]; + } + } + } + } + return output; + } + + /// + /// 1D Gaussian kernel + /// + public static float[] GaussianKernel1D(int size = 5, float sigma = 1.0f) + { + float[] kernel = new float[size]; + int center = size / 2; + float sum = 0; + for (int i = 0; i < size; i++) + { + float x = i - center; + kernel[i] = (float)Math.Exp(-0.5 * (x / sigma) * (x / sigma)); + sum += kernel[i]; + } + for (int i = 0; i < size; i++) + kernel[i] /= sum; + return kernel; + } + + /// + /// Separable Gaussian Blur: first horizontal, then vertical. + /// + /// Input image. + /// Kernel size. + /// Sigma value. + /// Blurred image. + public static float[,] GaussianBlur(float[,] img, int ksize = 5, float sigma = 1.0f) + { + float[] kernel = GaussianKernel1D(ksize, sigma); + int H = img.GetLength(0); + int W = img.GetLength(1); + int pad = ksize / 2; + + // Horizontal + float[,] padded = new float[H, W + 2 * pad]; + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + padded[y, x + pad] = img[y, x]; + // Reflect padding + for (int y = 0; y < H; y++) + for (int x = 0; x < pad; x++) + padded[y, x] = padded[y, 2 * pad - x]; + for (int y = 0; y < H; y++) + for (int x = W + pad; x < W + 2 * pad; x++) + padded[y, x] = padded[y, 2 * (W + pad) - x - 2]; + + float[,] tmp = new float[H, W]; + for (int y = 0; y < H; y++) + { + for (int x = 0; x < W; x++) + { + float sum = 0; + for (int j = 0; j < ksize; j++) + sum += kernel[j] * padded[y, x + j]; + tmp[y, x] = sum; + } + } + + // Vertical + padded = new float[H + 2 * pad, W]; + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + padded[y + pad, x] = tmp[y, x]; + // Reflect padding + for (int y = 0; y < pad; y++) + for (int x = 0; x < W; x++) + padded[y, x] = padded[2 * pad - y, x]; + for (int y = H + pad; y < H + 2 * pad; y++) + for (int x = 0; x < W; x++) + padded[y, x] = padded[2 * (H + pad) - y - 2, x]; + + float[,] output = new float[H, W]; + for (int y = 0; y < H; y++) + { + for (int x = 0; x < W; x++) + { + float sum = 0; + for (int i = 0; i < ksize; i++) + sum += kernel[i] * padded[y + i, x]; + output[y, x] = sum; + } + } + return output; + } + + /// + /// Sobel gradients in x/y, Magnitude and Angle. + /// + /// Input image. + /// Magnitude and Angle matrices. 
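+        /// Example (illustrative sketch; grayTile is assumed to be a byte[,] grayscale image):
+        ///     var (mag, ang) = ImageQuality.SobelGradients(grayTile);
+        ///     double sum = 0;
+        ///     foreach (float m in mag) sum += m;
+        ///     double meanMagnitude = sum / mag.Length;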
+ public static (float[,] mag, float[,] ang) SobelGradients(byte[,] img) + { + float[,] imgFloat = new float[img.GetLength(0), img.GetLength(1)]; + for (int y = 0; y < img.GetLength(0); y++) + for (int x = 0; x < img.GetLength(1); x++) + imgFloat[y, x] = img[y, x]; + + float[,] Kx = new float[,] { { -1, 0, 1 }, { -2, 0, 2 }, { -1, 0, 1 } }; + float[,] Ky = new float[,] { { -1, -2, -1 }, { 0, 0, 0 }, { 1, 2, 1 } }; + + float[,] gx = Convolve2D(imgFloat, Kx); + float[,] gy = Convolve2D(imgFloat, Ky); + + int H = img.GetLength(0); + int W = img.GetLength(1); + float[,] mag = new float[H, W]; + float[,] ang = new float[H, W]; + for (int y = 0; y < H; y++) + { + for (int x = 0; x < W; x++) + { + mag[y, x] = (float)Math.Sqrt(gx[y, x] * gx[y, x] + gy[y, x] * gy[y, x]); + ang[y, x] = (float)Math.Atan2(gy[y, x], gx[y, x]); + } + } + return (mag, ang); + } + + /// + /// Shannon entropy check over 256-bin histogram. + /// + /// Input image. + /// Entropy threshold. + /// Entropy value and pass/fail status. + public static (double entropy, bool passed) EntropyCheck(byte[,] img, double threshold = 5.0) + { + int[] hist = new int[256]; + int H = img.GetLength(0); + int W = img.GetLength(1); + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + hist[img[y, x]]++; + + double total = H * W; + double entropy = 0; + for (int i = 0; i < 256; i++) + { + if (hist[i] > 0) + { + double p = hist[i] / total; + entropy -= p * Math.Log(p, 2); + } + } + return (entropy, entropy >= threshold); + } + + /// + /// Low-Frequency-Ratio in FFT spectrum. + /// Internally rescales to 128x128. + /// + /// Input grayscale image. + /// Ratio threshold. + /// Ratio value and pass/fail status. + public static (double ratio, bool passed) FftCheck(byte[,] imgGray, double threshold = 0.15) + { + byte[,] small = ResizeBilinear(imgGray, 128, 128); + Complex[,] f = Fft2D(small); + Complex[,] fshift = FftShift(f); + double[,] magnitude = new double[128, 128]; + for (int y = 0; y < 128; y++) + for (int x = 0; x < 128; x++) + magnitude[y, x] = fshift[y, x].Magnitude; + + int h = 128, w = 128; + double centerSum = 0; + double totalSum = 0; + for (int y = h / 4; y < 3 * h / 4; y++) + for (int x = w / 4; x < 3 * w / 4; x++) + centerSum += magnitude[y, x]; + for (int y = 0; y < h; y++) + for (int x = 0; x < w; x++) + totalSum += magnitude[y, x]; + + double ratio = centerSum / totalSum; + return (ratio, ratio < threshold); + } + + /// + /// Simple 2D FFT using System.Numerics + /// + private static Complex[,] Fft2D(byte[,] img) + { + int H = img.GetLength(0); + int W = img.GetLength(1); + Complex[,] result = new Complex[H, W]; + for (int y = 0; y < H; y++) + { + for (int x = 0; x < W; x++) + { + // Simplified FFT - for production, use a proper FFT library + // This is a placeholder that converts to complex + result[y, x] = new Complex(img[y, x], 0); + } + } + // Note: Full 2D FFT implementation would be needed for production + return result; + } + + private static Complex[,] FftShift(Complex[,] f) + { + int H = f.GetLength(0); + int W = f.GetLength(1); + Complex[,] shifted = new Complex[H, W]; + int h2 = H / 2, w2 = W / 2; + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + shifted[(y + h2) % H, (x + w2) % W] = f[y, x]; + return shifted; + } + + /// + /// Otsu Thresholding. + /// + /// Input image. + /// Binary image (0 or 255). 
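+        /// Example (illustrative sketch; gray is assumed to be a byte[,] grayscale image):
+        ///     byte[,] binary = ImageQuality.OtsuThreshold(gray);
+        ///     var (components, manyComponents) = ImageQuality.ComponentsCheck(binary, threshold: 10);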
+ public static byte[,] OtsuThreshold(byte[,] img) + { + int[] hist = new int[256]; + int H = img.GetLength(0); + int W = img.GetLength(1); + int total = H * W; + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + hist[img[y, x]]++; + + long sumTotal = 0; + for (int i = 0; i < 256; i++) + sumTotal += i * hist[i]; + + long sumB = 0; + long wB = 0; + double maxVar = 0; + int threshold = 0; + + for (int t = 0; t < 256; t++) + { + wB += hist[t]; + if (wB == 0) continue; + long wF = total - wB; + if (wF == 0) break; + + sumB += t * hist[t]; + double mB = sumB / (double)wB; + double mF = (sumTotal - sumB) / (double)wF; + double varBetween = wB * wF * (mB - mF) * (mB - mF); + + if (varBetween > maxVar) + { + maxVar = varBetween; + threshold = t; + } + } + + byte[,] binary = new byte[H, W]; + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + binary[y, x] = (byte)(img[y, x] > threshold ? 255 : 0); + return binary; + } + + /// + /// 8-connectivity Connected Components, Union-Find based two-pass approach. + /// + /// Input binary image (0 background, !=0 foreground). + /// Minimum component count threshold. + /// Component count and pass/fail status. + public static (int components, bool passed) ComponentsCheck(byte[,] binaryImg, int threshold = 10) + { + int H = binaryImg.GetLength(0); + int W = binaryImg.GetLength(1); + int[,] labels = new int[H, W]; + int maxLabels = H * W / 2 + 1; + int[] parent = new int[maxLabels]; + int[] rank = new int[maxLabels]; + for (int i = 0; i < maxLabels; i++) + parent[i] = i; + + int nextLabel = 1; + + int Find(int x) + { + while (parent[x] != x) + { + parent[x] = parent[parent[x]]; + x = parent[x]; + } + return x; + } + + void Union(int a, int b) + { + int ra = Find(a); + int rb = Find(b); + if (ra == rb) return; + if (rank[ra] < rank[rb]) + parent[ra] = rb; + else if (rank[ra] > rank[rb]) + parent[rb] = ra; + else + { + parent[rb] = ra; + rank[ra]++; + } + } + + // First pass + for (int y = 0; y < H; y++) + { + for (int x = 0; x < W; x++) + { + if (binaryImg[y, x] == 0) continue; + + List neighbors = new List(); + int[] dy = { -1, -1, -1, 0 }; + int[] dx = { -1, 0, 1, -1 }; + for (int i = 0; i < 4; i++) + { + int ny = y + dy[i]; + int nx = x + dx[i]; + if (ny >= 0 && ny < H && nx >= 0 && nx < W && labels[ny, nx] > 0) + neighbors.Add(labels[ny, nx]); + } + + if (neighbors.Count == 0) + { + labels[y, x] = nextLabel++; + } + else + { + int m = neighbors.Min(); + labels[y, x] = m; + foreach (int n in neighbors) + if (n != m) Union(m, n); + } + } + } + + // Second pass: Label flattening + Dictionary labelMap = new Dictionary(); + int current = 1; + for (int y = 0; y < H; y++) + { + for (int x = 0; x < W; x++) + { + if (labels[y, x] > 0) + { + int root = Find(labels[y, x]); + if (!labelMap.ContainsKey(root)) + labelMap[root] = current++; + labels[y, x] = labelMap[root]; + } + } + } + + int components = current - 1; + return (components, components >= threshold); + } + + /// + /// Non-maximum suppression + /// + public static float[,] NonMaxSuppression(float[,] mag, float[,] ang) + { + int H = mag.GetLength(0); + int W = mag.GetLength(1); + float[,] Z = new float[H, W]; + float[,] angDeg = new float[H, W]; + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + { + angDeg[y, x] = ang[y, x] * 180.0f / (float)Math.PI; + if (angDeg[y, x] < 0) angDeg[y, x] += 180; + } + + // Direction quantization + // 0°, 45°, 90°, 135° + + for (int y = 1; y < H - 1; y++) + { + for (int x = 1; x < W - 1; x++) + { + float angle = angDeg[y, x]; + float m0 = mag[y, x]; 
+ float m1 = 0, m2 = 0; + + // Helper function: compares with two neighbors in given direction + if ((angle >= 0 && angle < 22.5) || (angle >= 157.5 && angle <= 180)) + { + m1 = mag[y, x - 1]; + m2 = mag[y, x + 1]; + } + else if (angle >= 22.5 && angle < 67.5) + { + m1 = mag[y - 1, x + 1]; + m2 = mag[y + 1, x - 1]; + } + else if (angle >= 67.5 && angle < 112.5) + { + m1 = mag[y - 1, x]; + m2 = mag[y + 1, x]; + } + else if (angle >= 112.5 && angle < 157.5) + { + m1 = mag[y - 1, x - 1]; + m2 = mag[y + 1, x + 1]; + } + + if (m0 >= m1 && m0 >= m2) + Z[y, x] = m0; + } + } + return Z; + } + + /// + /// Hysteresis thresholding + /// + public static byte[,] HysteresisThresholding(float[,] img, float low, float high) + { + int H = img.GetLength(0); + int W = img.GetLength(1); + byte strongVal = 255; + byte weakVal = 50; + byte[,] result = new byte[H, W]; + + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + { + if (img[y, x] >= high) + result[y, x] = strongVal; + else if (img[y, x] >= low) + result[y, x] = weakVal; + } + + bool changed = true; + while (changed) + { + changed = false; + + // Neighborhood of a strong pixel: + // 8-neighborhood via shifts + // Weak pixels that border strong become strong + for (int y = 1; y < H - 1; y++) + { + for (int x = 1; x < W - 1; x++) + { + if (result[y, x] == weakVal) + { + bool hasStrong = false; + for (int dy = -1; dy <= 1; dy++) + { + for (int dx = -1; dx <= 1; dx++) + { + if (dx == 0 && dy == 0) continue; + if (result[y + dy, x + dx] == strongVal) + { + hasStrong = true; + break; + } + } + if (hasStrong) break; + } + if (hasStrong) + { + result[y, x] = strongVal; + changed = true; + } + } + } + } + } + + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + if (result[y, x] != strongVal) + result[y, x] = 0; + + return result; + } + + /// + /// Full Canny Edge Detector. + /// + /// Input image. + /// Low threshold. + /// High threshold. + /// Edge image. + public static byte[,] CannyNumPy(byte[,] img, float low = 50.0f, float high = 100.0f) + { + float[,] imgFloat = new float[img.GetLength(0), img.GetLength(1)]; + for (int y = 0; y < img.GetLength(0); y++) + for (int x = 0; x < img.GetLength(1); x++) + imgFloat[y, x] = img[y, x]; + + float[,] blur = GaussianBlur(imgFloat, 5, 1.0f); + byte[,] blurByte = new byte[blur.GetLength(0), blur.GetLength(1)]; + for (int y = 0; y < blur.GetLength(0); y++) + for (int x = 0; x < blur.GetLength(1); x++) + blurByte[y, x] = (byte)Math.Max(0, Math.Min(255, blur[y, x])); + + var (mag, ang) = SobelGradients(blurByte); + float[,] nms = NonMaxSuppression(mag, ang); + byte[,] edges = HysteresisThresholding(nms, low, high); + return edges; + } + + /// + /// Edge density check: mean(edges)/255.0. + /// + /// Input edge image (0/255). + /// Density threshold. + /// Density value and pass/fail status. + public static (double density, bool passed) EdgeDensityCheck(byte[,] edges, double threshold = 0.2) + { + int H = edges.GetLength(0); + int W = edges.GetLength(1); + long sum = 0; + for (int y = 0; y < H; y++) + for (int x = 0; x < W; x++) + sum += edges[y, x]; + double density = sum / (255.0 * H * W); + return (density, density >= threshold); + } + + /// + /// Runs all four checks and calculates weighted score. + /// + /// Input 2D byte array (grayscale). + /// Dictionary with analysis results. 
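+ /// Keys are "entropy", "fft_ratio", "components" and "edge_density" (each a (value, passed) tuple),
+ /// plus "score": components and edges contribute 2 points each, entropy and FFT 1 point each (maximum 6).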
+ public static Dictionary AnalyzeImage(byte[,] imgGray) + { + // 1) Entropy + var (entropyVal, entropyOk) = EntropyCheck(imgGray); + + // 2) FFT ratio + var (fftRatio, fftOk) = FftCheck(imgGray); + + // 3) Components + byte[,] binary = OtsuThreshold(imgGray); + var (componentsCnt, componentsOk) = ComponentsCheck(binary); + + // 4) Edges + byte[,] edges = CannyNumPy(imgGray); + var (edgeDensity, edgesOk) = EdgeDensityCheck(edges); + + // Weighted score + int score = 0; + if (componentsOk) score += 2; + if (edgesOk) score += 2; + if (entropyOk) score += 1; + if (fftOk) score += 1; + + return new Dictionary + { + ["entropy"] = (entropyVal, entropyOk), + ["fft_ratio"] = (fftRatio, fftOk), + ["components"] = (componentsCnt, componentsOk), + ["edge_density"] = (edgeDensity, edgesOk), + ["score"] = (score, false), + }; + } + } +} diff --git a/MuPDF.NET4LLM/helpers/MuPdfRag.cs b/MuPDF.NET4LLM/helpers/MuPdfRag.cs new file mode 100644 index 0000000..3994783 --- /dev/null +++ b/MuPDF.NET4LLM/helpers/MuPdfRag.cs @@ -0,0 +1,1619 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using MuPDF.NET; + +namespace MuPDF.NET4LLM.Helpers +{ + /// + /// Header identification based on font sizes + /// + public class IdentifyHeaders + { + private Dictionary _headerId = new Dictionary(); + private float _bodyLimit; + + public IdentifyHeaders( + object doc, // Document or string + List pages = null, + float bodyLimit = 12.0f, // Force this to be body text + int maxLevels = 6) // Accept this many header levels + { + if (maxLevels < 1 || maxLevels > 6) + throw new ArgumentException("max_levels must be between 1 and 6"); + + Document mydoc = doc as Document; + if (mydoc == null) + { + mydoc = new Document(doc.ToString()); + } + + // Remove StructTreeRoot to avoid possible performance degradation + // We will not use the structure tree anyway. + if (pages == null) // Use all pages if omitted + pages = Enumerable.Range(0, mydoc.PageCount).ToList(); + + Dictionary fontSizes = new Dictionary(); + + foreach (int pno in pages) + { + Page page = mydoc.LoadPage(pno); + // Use TEXTFLAGS_TEXT for proper text extraction (matches Python TEXTFLAGS_TEXT) + int textFlags = (int)TextFlagsExtension.TEXTFLAGS_TEXT; + TextPage textPage = page.GetTextPage(flags: textFlags); + PageInfo pageInfo = textPage.ExtractDict(null, false); + + // Look at all non-empty horizontal spans + foreach (var block in pageInfo.Blocks ?? new List()) + { + if (block.Type != 0) continue; + if (block.Lines == null) continue; + + foreach (var line in block.Lines) + { + if (line.Spans == null) continue; + foreach (var span in line.Spans) + { + string text = span.Text ?? ""; + if (Utils.IsWhite(text)) continue; + + int fontSz = (int)Math.Round(span.Size); // Compute rounded fontsize + if (!fontSizes.ContainsKey(fontSz)) + fontSizes[fontSz] = 0; + fontSizes[fontSz] += text.Trim().Length; // Add character count + } + } + } + + textPage.Dispose(); + page.Dispose(); + } + + if (mydoc != doc as Document) + // If opened here, close it now + mydoc.Close(); + + // Maps a fontsize to a string of multiple # header tag characters + // If not provided, choose the most frequent font size as body text. + // If no text at all on all pages, just use body_limit. 
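+ // the resulting body limit are treated as body text.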
+ // In any case all fonts not exceeding + var sorted = fontSizes.OrderBy(kvp => (kvp.Value, kvp.Key)).ToList(); + if (sorted.Count > 0) + { + // Most frequent font size + _bodyLimit = Math.Max(bodyLimit, sorted[sorted.Count - 1].Key); + } + else + { + _bodyLimit = bodyLimit; + } + + // Identify up to 6 font sizes as header candidates + var sizes = fontSizes.Keys + .Where(f => f > _bodyLimit) + .OrderByDescending(f => f) + .Take(maxLevels) + .ToList(); + + // Make the header tag dictionary + for (int i = 0; i < sizes.Count; i++) + { + _headerId[sizes[i]] = new string('#', i + 1) + " "; + } + + if (_headerId.Count > 0) + _bodyLimit = _headerId.Keys.Min() - 1; + } + + /// + /// Return appropriate markdown header prefix. + /// Given a text span from a "dict"/"rawdict" extraction, determine the + /// markdown header prefix string of 0 to n concatenated '#' characters. + /// + public string GetHeaderId(ExtendedSpan span, Page page = null) + { + int fontsize = (int)Math.Round(span.Size); // Compute fontsize + if (fontsize <= _bodyLimit) + return ""; + string hdrId = _headerId.ContainsKey(fontsize) ? _headerId[fontsize] : ""; + return hdrId; + } + } + + /// + /// Header identification based on Table of Contents + /// + public class TocHeaders + { + private List _toc; + + /// + /// Read and store the TOC of the document. + /// + public TocHeaders(object doc) + { + Document mydoc = doc as Document; + if (mydoc == null) + { + mydoc = new Document(doc.ToString()); + } + + _toc = mydoc.GetToc(); + + if (mydoc != doc as Document) + // If opened here, close it now + mydoc.Close(); + } + + /// + /// Return appropriate markdown header prefix. + /// Given a text span from a "dict"/"rawdict" extraction, determine the + /// markdown header prefix string of 0 to n concatenated '#' characters. + /// + public string GetHeaderId(ExtendedSpan span, Page page = null) + { + if (page == null) + return ""; + // Check if this page has TOC entries with an actual title + var myToc = _toc.Where(t => !string.IsNullOrEmpty(t.Title) && t.Page == page.Number + 1).ToList(); + if (myToc.Count == 0) // No TOC items present on this page + return ""; + // Check if the span matches a TOC entry. This must be done in the + // most forgiving way: exact matches are rare animals. + string text = (span.Text ?? 
"").Trim(); // Remove leading and trailing whitespace + foreach (var t in myToc) + { + string title = t.Title.Trim(); // Title of TOC entry + int lvl = t.Level; // Level of TOC entry + if (text.StartsWith(title) || title.StartsWith(text)) + { + // Found a match: return the header tag + return new string('#', lvl) + " "; + } + } + return ""; + } + } + + /// + /// Parameters class to store page-specific information (matches Python dataclass) + /// + public class Parameters + { + public Page Page { get; set; } + public string Filename { get; set; } + public string MdString { get; set; } = ""; + public List Images { get; set; } = new List(); + public List Tables { get; set; } = new List(); + public List Graphics { get; set; } = new List(); + public List Words { get; set; } = new List(); + public List LineRects { get; set; } = new List(); + public bool AcceptInvisible { get; set; } + public float[] BgColor { get; set; } + public Rect Clip { get; set; } + public List Links { get; set; } = new List(); + public List AnnotRects { get; set; } = new List(); + public TextPage TextPage { get; set; } + public List ImgRects { get; set; } = new List(); + public List TabRects0 { get; set; } = new List(); + public Dictionary TabRects { get; set; } = new Dictionary(); + public List
<Table> Tabs { get; set; } = new List<Table>
(); + public List WrittenTables { get; set; } = new List(); + public List WrittenImages { get; set; } = new List(); + public List ActualPaths { get; set; } = new List(); + public List VgClusters0 { get; set; } = new List(); + public Dictionary VgClusters { get; set; } = new Dictionary(); + } + + /// + /// Main markdown conversion utilities. + /// Ported and adapted from the Python module helpers/pymupdf_rag.py in pymupdf4llm. + /// + public static class MuPdfRag + { + private const string GRAPHICS_TEXT = "\n![]({0})\n"; + + /// + /// Convert a document to Markdown, closely following the behavior of + /// pymupdf4llm.helpers.pymupdf_rag.ToMarkdown. + /// + /// Input to convert. + /// + /// Page numbers (0‑based) to process. When null, all pages are processed. + /// + /// + /// Optional header resolver used to create Markdown headings. This can be + /// an instance, a instance, + /// or null to auto‑detect headers. + /// + /// + /// When true, images are written to disk and referenced by relative path. + /// + /// + /// When true, images are embedded as data: URLs in the Markdown. + /// Cannot be combined with . + /// + /// + /// When true, image regions are ignored entirely (no image and no OCR text). + /// + /// + /// When true, vector graphics are ignored (no layout‑based table / column hints). + /// + /// + /// When true, tries to detect a uniform page background to filter + /// out large background rectangles from graphics analysis. + /// + /// + /// Target directory for written images when is true. + /// + /// Image file format, e.g. "png" or "jpg". + /// + /// Minimum relative size (\(0 \leq v < 1\)) of images with respect to the page + /// before they are considered for output. + /// + /// + /// Logical filename used in image names and metadata; defaults to . + /// + /// + /// When true, attempts to also extract text from image regions (e.g. diagrams) + /// in addition to placing images. + /// + /// + /// When true, returns a JSON string describing per‑page “chunks” instead of raw Markdown. + /// + /// + /// When true, appends an explicit --- end of page=... marker after each page. + /// + /// + /// Optional margins in points. One value applies to all sides, two values to + /// top/bottom and left/right, and four values to left, top, right, bottom. + /// + /// + /// Resolution used for image extraction where a is rendered. + /// + /// + /// Page width used for reflowable documents when is true. + /// + /// + /// Optional page height for reflowable documents. If null, a single tall page + /// covering the whole document is created. + /// + /// + /// Table detection strategy passed to Page.GetTables, e.g. "lines_strict" + /// to mimic the Python default. + /// + /// + /// Optional upper bound on the number of path objects before graphics are ignored + /// for layout analysis (similar to graphics_limit in Python). + /// + /// + /// Minimum font size considered as “normal” text when computing some heuristics. + /// + /// + /// When true, code blocks (mono‑spaced text) are not emitted as fenced code blocks. + /// + /// + /// When true, the return value is a JSON description of page “chunks” with + /// word positions, matching the Python extract_words mode. + /// + /// + /// When true, prints a simple progress bar while processing pages. + /// + /// + /// When true, uses glyph IDs for unknown Unicode characters, similar to + /// FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE in the C API. + /// + /// + /// When true, treats fully transparent text as visible (affects OCR heuristics). 
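+ /// A minimal call sketch (illustrative only; "input.pdf" and "output.md" are placeholder paths):
+ /// Document doc = new Document("input.pdf");
+ /// string md = MuPdfRag.ToMarkdown(doc, writeImages: false, pageChunks: false);
+ /// File.WriteAllText("output.md", md, Encoding.UTF8);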
+ /// + /// + /// Markdown text for the selected pages, or a JSON string describing page chunks + /// when / is enabled. + /// + public static string ToMarkdown( + Document doc, + List pages = null, + object hdrInfo = null, // Can be IdentifyHeaders, TocHeaders, or null + bool writeImages = false, + bool embedImages = false, + bool ignoreImages = false, + bool ignoreGraphics = false, + bool detectBgColor = true, + string imagePath = "", + string imageFormat = "png", + float imageSizeLimit = 0.05f, + string filename = null, + bool forceText = true, + bool pageChunks = false, + bool pageSeparators = false, + List margins = null, + int dpi = 150, + float pageWidth = 612, + float? pageHeight = null, + string tableStrategy = "lines_strict", + int? graphicsLimit = null, + float fontsizeLimit = 3.0f, + bool ignoreCode = false, + bool extractWords = false, + bool showProgress = false, + bool useGlyphs = false, + bool ignoreAlpha = false) + { + if (!writeImages && !embedImages && !forceText) + throw new ArgumentException("Images and text on images cannot both be suppressed."); + if (embedImages) + { + writeImages = false; + imagePath = string.Empty; + } + if (imageSizeLimit < 0 || imageSizeLimit >= 1) + { + throw new ArgumentOutOfRangeException( + nameof(imageSizeLimit), + "'imageSizeLimit' must be non-negative and less than 1."); + } + + int DPI = dpi; + bool IGNORE_CODE = ignoreCode; + string IMG_EXTENSION = imageFormat; + bool EXTRACT_WORDS = extractWords; + if (EXTRACT_WORDS) + { + pageChunks = true; + ignoreCode = true; + } + string IMG_PATH = imagePath; + if (!string.IsNullOrEmpty(IMG_PATH) && writeImages && !Directory.Exists(IMG_PATH)) + Directory.CreateDirectory(IMG_PATH); + + string FILENAME = filename ?? doc.Name; + // Assign configuration + int? GRAPHICS_LIMIT = graphicsLimit; + double FONTSIZE_LIMIT = fontsizeLimit; + bool IGNORE_IMAGES = ignoreImages; + bool IGNORE_GRAPHICS = ignoreGraphics; + bool DETECT_BG_COLOR = detectBgColor; + + if (filename == null) + filename = doc.Name; + + // Handle form PDFs and documents with annotations + if (doc.IsFormPDF > 0 || (doc.IsPDF && doc.HasAnnots())) + { + doc.Bake(); + } + + // For reflowable documents, allow making 1 page for the whole document + if (doc.IsReflowable) + { + if (pageHeight.HasValue) + { + // Accept user page dimensions + doc.SetLayout(width: pageWidth, height: pageHeight.Value); + } + else + { + // No page height limit given: make 1 page for whole document + doc.SetLayout(width: pageWidth, height: 792); + int pageCount = doc.PageCount; + float height = 792 * pageCount; // Height that covers full document + doc.SetLayout(width: pageWidth, height: height); + } + } + + if (pages == null) // Use all pages if no selection given + pages = Enumerable.Range(0, doc.PageCount).ToList(); + + // Process margins: convert to 4-element list + if (margins == null) + margins = new List { 0, 0, 0, 0 }; + else if (margins.Count == 1) + margins = new List { margins[0], margins[0], margins[0], margins[0] }; + else if (margins.Count == 2) + margins = new List { 0, margins[0], 0, margins[1] }; + else if (margins.Count != 4) + throw new ArgumentException("margins must be one, two or four floats"); + + // If "hdr_info" is not an object with a method "get_header_id", scan the + // document and use font sizes as header level indicators. 
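+ // For example, passing "new TocHeaders(doc)" as hdrInfo derives heading levels from the
+ // table of contents, while null (the default) falls back to font-size based detection.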
+ Func getHeaderId; + + if (hdrInfo is IdentifyHeaders idHdr) + getHeaderId = idHdr.GetHeaderId; + else if (hdrInfo is TocHeaders tocHdr) + getHeaderId = tocHdr.GetHeaderId; + else if (hdrInfo == null) + { + var idHdr2 = new IdentifyHeaders(doc, pages); + getHeaderId = idHdr2.GetHeaderId; + } + else + getHeaderId = (s, p) => ""; + + // Initialize output based on page_chunks mode + object documentOutput; + if (!pageChunks) + { + documentOutput = new StringBuilder(); + } + else + { + documentOutput = new List>(); + } + + // Read the Table of Contents + List toc = doc.GetToc(); + + // Text extraction flags: omit clipped text, collect styles + int textFlags = (int)TextFlags.TEXT_MEDIABOX_CLIP | + (int)mupdf.mupdf.FZ_STEXT_COLLECT_STYLES; + + // Optionally replace REPLACEMENT_CHARACTER by glyph number + if (useGlyphs) + { + textFlags |= (int)mupdf.mupdf.FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE; + } + + // Note: Table FLAGS would be set here if we had access to pymupdf.table.FLAGS + // In C#, this would need to be handled differently if table extraction uses flags + + var progressBar = showProgress && pages.Count > 5 + ? ProgressBar.Create(pages.Cast().ToList()) + : null; + + try + { + if (showProgress) + { + Console.WriteLine($"Processing {FILENAME}..."); + } + + foreach (int pno in pages) + { + if (progressBar != null && !progressBar.MoveNext()) + break; + + Parameters pageParms = GetPageOutput( + doc, pno, margins, getHeaderId, writeImages, embedImages, ignoreImages, + imagePath, imageFormat, filename, forceText, dpi, ignoreCode, + ignoreGraphics, tableStrategy, detectBgColor, graphicsLimit, + ignoreAlpha, extractWords, pageSeparators, imageSizeLimit, textFlags); + + if (!pageChunks) + { + ((StringBuilder)documentOutput).Append(pageParms.MdString); + } + else + { + // Build subset of TOC for this page + var pageTocs = toc.Where(t => t.Page == pno + 1).ToList(); + + var metadata = GetMetadata(doc, pno, FILENAME); + + var pageChunk = new Dictionary + { + ["metadata"] = metadata, + ["toc_items"] = pageTocs, + ["tables"] = pageParms.Tables, + ["images"] = pageParms.Images, + ["graphics"] = pageParms.Graphics, + ["text"] = pageParms.MdString, + ["words"] = pageParms.Words + }; + + ((List>)documentOutput).Add(pageChunk); + } + } + } + finally + { + progressBar?.Dispose(); + } + + if (!pageChunks) + { + return ((StringBuilder)documentOutput).ToString(); + } + else + { + // For page_chunks mode, we need to return a structured format + // Since System.Text.Json may not be available in all .NET versions, + // we'll use Newtonsoft.Json if available, or return a simple string representation + try + { + return Newtonsoft.Json.JsonConvert.SerializeObject(documentOutput, Newtonsoft.Json.Formatting.Indented); + } + catch + { + // Fallback: return a simple string representation + var sb = new StringBuilder(); + foreach (var chunk in (List>)documentOutput) + { + sb.AppendLine("--- Page Chunk ---"); + foreach (var kvp in chunk) + { + sb.AppendLine($"{kvp.Key}: {kvp.Value}"); + } + sb.AppendLine(); + } + return sb.ToString(); + } + } + } + + /// + /// Get maximum header ID from spans (matches Python max_header_id) + /// + private static string MaxHeaderId( + List spans, + Page page, + Func getHeaderId) + { + var hdrIds = spans + .Select(s => getHeaderId(s, page)) + .Where(h => !string.IsNullOrEmpty(h)) + .Select(h => h.Trim().Length) + .Where(l => l > 0) + .Distinct() + .OrderBy(l => l) + .ToList(); + + if (hdrIds.Count == 0) + return ""; + + // Return header tag with one less '#' than the minimum found + return new 
string('#', hdrIds[0] - 1) + " "; + } + + /// + /// Accept a span and return a markdown link string. + /// A link should overlap at least 70% of the span. + /// + private static string ResolveLinks(List links, ExtendedSpan span) + { + if (links == null || links.Count == 0 || span == null || span.Bbox == null) + return null; + + Rect spanBbox = span.Bbox; // Span bbox + + foreach (var link in links) + { + // Only process URI links + if (link.Kind != LinkType.LINK_URI || string.IsNullOrEmpty(link.Uri)) + continue; + + if (link.From == null) + continue; + + // The hot area of the link + // Middle point of hot area + float middleX = (link.From.TopLeft.X + link.From.BottomRight.X) / 2; + float middleY = (link.From.TopLeft.Y + link.From.BottomRight.Y) / 2; + + // Does not touch the bbox + if (!(middleX >= spanBbox.X0 && middleX <= spanBbox.X1 && + middleY >= spanBbox.Y0 && middleY <= spanBbox.Y1)) + continue; + + string text = (span.Text ?? "").Trim(); + if (!string.IsNullOrEmpty(text)) + { + return $"[{text}]({link.Uri})"; + } + } + + return null; + } + + /// + /// Optionally render the rect part of a page. + /// We will ignore images that are empty or that have an edge smaller + /// than x% of the corresponding page edge. + /// + private static string SaveImage( + Page page, + Rect rect, + int imageIndex, + bool writeImages, + bool embedImages, + string imagePath, + string imageFormat, + string filename, + int dpi, + float imageSizeLimit) + { + // Check if image is too small + if (rect.Width < page.Rect.Width * imageSizeLimit || + rect.Height < page.Rect.Height * imageSizeLimit) + { + return ""; + } + + if (!writeImages && !embedImages) + return ""; + + Pixmap pix = page.GetPixmap(clip: rect, dpi: dpi); + try + { + if (pix.H <= 0 || pix.W <= 0) + return ""; + + if (writeImages) + { + // Ensure image path exists + if (!string.IsNullOrEmpty(imagePath) && !Directory.Exists(imagePath)) + { + Directory.CreateDirectory(imagePath); + } + + string safeFilename = Path.GetFileName(filename ?? "document").Replace(" ", "-"); + string imageFilename = string.IsNullOrEmpty(imagePath) + ? $"{safeFilename}-{page.Number}-{imageIndex}.{imageFormat}" + : Path.Combine(imagePath, $"{safeFilename}-{page.Number}-{imageIndex}.{imageFormat}"); + pix.Save(imageFilename); + return imageFilename.Replace("\\", "/"); + } + else if (embedImages) + { + // Make a base64 encoded string of the image + byte[] imageBytes = pix.ToBytes(imageFormat); + string base64 = Convert.ToBase64String(imageBytes); + return $"data:image/{imageFormat};base64,{base64}"; + } + } + finally + { + pix.Dispose(); + } + + return ""; + } + + /// + /// Check if page exclusively contains OCR text. + /// For this to be true, all text must be written as "ignore-text". + /// + private static bool PageIsOcr(Page page) + { + try + { + var bboxLog = page.GetBboxlog(); + var textTypes = new HashSet(bboxLog + .Where(b => b.Type != null && b.Type.Contains("text")) + .Select(b => b.Type) + .Distinct()); + + return textTypes.Count == 1 && textTypes.Contains("ignore-text"); + } + catch + { + return false; + } + } + + /// + /// Get metadata for a page (matches Python get_metadata) + /// + private static Dictionary GetMetadata(Document doc, int pno, string filename) + { + var meta = new Dictionary(); + if (doc.MetaData != null) + { + foreach (var kvp in doc.MetaData) + { + meta[kvp.Key] = kvp.Value; + } + } + meta["file_path"] = filename; + meta["page_count"] = doc.PageCount; + meta["page"] = pno + 1; + return meta; + } + + /// + /// Reorder words in lines. 
+ /// The argument list must be presorted by bottom, then left coordinates. + /// Words with similar top / bottom coordinates are assumed to belong to + /// the same line and will be sorted left to right within that line. + /// + private static List SortWords(List words) + { + if (words == null || words.Count == 0) + return new List(); + + List nwords = new List(); + List line = new List { words[0] }; + Rect lrect = new Rect(words[0].X0, words[0].Y0, words[0].X1, words[0].Y1); + + for (int i = 1; i < words.Count; i++) + { + var word = words[i]; + var wrect = new Rect(word.X0, word.Y0, word.X1, word.Y1); + if (Math.Abs(wrect.Y0 - lrect.Y0) <= 3 || Math.Abs(wrect.Y1 - lrect.Y1) <= 3) + { + line.Add(word); + lrect = Utils.JoinRects(new List { lrect, wrect }); + } + else + { + line = line.OrderBy(w => w.X0).ToList(); + nwords.AddRange(line); + line = new List { word }; + lrect = new Rect(word.X0, word.Y0, word.X1, word.Y1); + } + } + + line = line.OrderBy(w => w.X0).ToList(); + nwords.AddRange(line); + return nwords; + } + + /// + /// Output tables above given text rectangle (matches Python output_tables) + /// + private static string OutputTables(Parameters parms, Rect textRect, bool extractWords) + { + StringBuilder thisMd = new StringBuilder(); // Markdown string for table(s) content + + if (textRect != null) // Select tables above the text block + { + var tabCandidates = parms.TabRects + .Where(kvp => kvp.Value.Y1 <= textRect.Y0 && !parms.WrittenTables.Contains(kvp.Key) && + (textRect.X0 <= kvp.Value.X0 && kvp.Value.X0 < textRect.X1 || + textRect.X0 < kvp.Value.X1 && kvp.Value.X1 <= textRect.X1 || + kvp.Value.X0 <= textRect.X0 && textRect.X1 <= kvp.Value.X1)) + .OrderBy(kvp => kvp.Value.Y1) + .ThenBy(kvp => kvp.Value.X0) + .ToList(); + + foreach (var kvp in tabCandidates) + { + int i = kvp.Key; + thisMd.Append("\n" + parms.Tabs[i].ToMarkdown(clean: false) + "\n"); + + if (extractWords) + { + // For "words" extraction, add table cells as line rects + var cells = new List(); + if (parms.Tabs[i].header != null && parms.Tabs[i].header.cells != null) + { + foreach (var c in parms.Tabs[i].header.cells) + { + if (c != null) + cells.Add(c); + } + } + if (parms.Tabs[i].cells != null) + { + foreach (var c in parms.Tabs[i].cells) + { + if (c != null) + cells.Add(c); + } + } + cells = cells.Distinct() + .OrderBy(c => c.Y1) + .ThenBy(c => c.X0) + .ToList(); + parms.LineRects.AddRange(cells); + } + parms.WrittenTables.Add(i); // Do not touch this table twice + } + } + else // Output all remaining tables + { + foreach (var kvp in parms.TabRects) + { + int i = kvp.Key; + if (parms.WrittenTables.Contains(i)) + continue; + + thisMd.Append("\n" + parms.Tabs[i].ToMarkdown(clean: false) + "\n"); + + if (extractWords) + { + // For "words" extraction, add table cells as line rects + var cells = new List(); + if (parms.Tabs[i].header != null && parms.Tabs[i].header.cells != null) + { + foreach (var c in parms.Tabs[i].header.cells) + { + if (c != null) + cells.Add(c); + } + } + if (parms.Tabs[i].cells != null) + { + foreach (var c in parms.Tabs[i].cells) + { + if (c != null) + cells.Add(c); + } + } + cells = cells.Distinct() + .OrderBy(c => c.Y1) + .ThenBy(c => c.X0) + .ToList(); + parms.LineRects.AddRange(cells); + } + parms.WrittenTables.Add(i); // Do not touch this table twice + } + } + + return thisMd.ToString(); + } + + /// + /// Output images and graphics above text rectangle (matches Python output_images) + /// + private static string OutputImages(Parameters parms, Rect textRect, bool forceText, + bool 
writeImages, bool embedImages, string imagePath, string imageFormat, + string filename, int dpi, float imageSizeLimit, Func getHeaderId) + { + if (parms.ImgRects == null || parms.ImgRects.Count == 0) + return ""; + + StringBuilder thisMd = new StringBuilder(); // Markdown string + + if (textRect != null) // Select images above the text block + { + for (int i = 0; i < parms.ImgRects.Count; i++) + { + if (parms.WrittenImages.Contains(i)) + continue; + + Rect imgRect = parms.ImgRects[i]; + if (imgRect.Y0 > textRect.Y0) + continue; + if (imgRect.X0 >= textRect.X1 || imgRect.X1 <= textRect.X0) + continue; + + string pathname = SaveImage(parms.Page, imgRect, i, writeImages, embedImages, + imagePath, imageFormat, filename, dpi, imageSizeLimit); + parms.WrittenImages.Add(i); // Do not touch this image twice + + if (!string.IsNullOrEmpty(pathname)) + { + thisMd.AppendFormat(GRAPHICS_TEXT, pathname); + } + + if (forceText) + { + // Recursive invocation + string imgTxt = WriteText(parms, imgRect, getHeaderId, forceText: true, + ignoreCode: false, extractWords: false); + if (!Utils.IsWhite(imgTxt)) // Was there text at all? + { + thisMd.Append(imgTxt); + } + } + } + } + else // Output all remaining images + { + for (int i = 0; i < parms.ImgRects.Count; i++) + { + if (parms.WrittenImages.Contains(i)) + continue; + + string pathname = SaveImage(parms.Page, parms.ImgRects[i], i, writeImages, embedImages, + imagePath, imageFormat, filename, dpi, imageSizeLimit); + parms.WrittenImages.Add(i); // Do not touch this image twice + + if (!string.IsNullOrEmpty(pathname)) + { + thisMd.AppendFormat(GRAPHICS_TEXT, pathname); + } + + if (forceText) + { + string imgTxt = WriteText(parms, parms.ImgRects[i], getHeaderId, forceText: true, + ignoreCode: false, extractWords: false); + if (!Utils.IsWhite(imgTxt)) + { + thisMd.Append(imgTxt); + } + } + } + } + + return thisMd.ToString(); + } + + /// + /// Output the text found inside the given clip. + /// This is an alternative for plain text in that it outputs + /// text enriched with markdown styling. + /// The logic is capable of recognizing headers, body text, code blocks, + /// inline code, bold, italic and bold-italic styling. + /// There is also some effort for list supported (ordered / unordered) in + /// that typical characters are replaced by respective markdown characters. + /// 'tables'/'images' indicate whether this execution should output these + /// objects. 
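+ /// Note: this method is also called recursively for image regions when forced text
+ /// extraction is active, so text printed on top of pictures is still captured.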
+ /// + private static string WriteText(Parameters parms, Rect clip, + Func getHeaderId, bool forceText, bool ignoreCode, bool extractWords) + { + if (clip == null) + clip = parms.Clip; + + StringBuilder outString = new StringBuilder(); + + // This is a list of tuples (linerect, spanlist) + var nlines = GetTextLines.GetRawLines(parms.TextPage, null, clip, tolerance: 3, + ignoreInvisible: !parms.AcceptInvisible); + + // Filter out lines that intersect with tables + nlines = nlines + .Where(l => Utils.OutsideAllBboxes(l.Rect, parms.TabRects.Values)) + .ToList(); + + parms.LineRects.AddRange(nlines.Select(l => l.Rect)); // Store line rectangles + + Rect prevLrect = null; // Previous line rectangle + int prevBno = -1; // Previous block number of line + bool code = false; // Mode indicator: outputting code + string prevHdrString = null; + + foreach (var line in nlines) + { + Rect lrect = line.Rect; + var spans = line.Spans; + + // Skip if line intersects with images + if (!Utils.OutsideAllBboxes(lrect, parms.ImgRects)) + continue; + + // Pick up tables ABOVE this text block + var tabCandidates = parms.TabRects + .Where(kvp => kvp.Value.Y1 <= lrect.Y0 && !parms.WrittenTables.Contains(kvp.Key) && + (lrect.X0 <= kvp.Value.X0 && kvp.Value.X0 < lrect.X1 || + lrect.X0 < kvp.Value.X1 && kvp.Value.X1 <= lrect.X1 || + kvp.Value.X0 <= lrect.X0 && lrect.X1 <= kvp.Value.X1)) + .ToList(); + + foreach (var kvp in tabCandidates) + { + int i = kvp.Key; + outString.Append("\n" + parms.Tabs[i].ToMarkdown(clean: false) + "\n"); + + if (extractWords) + { + var cells = new List(); + if (parms.Tabs[i].header != null && parms.Tabs[i].header.cells != null) + { + foreach (var c in parms.Tabs[i].header.cells) + { + if (c != null) + cells.Add(c); + } + } + if (parms.Tabs[i].cells != null) + { + foreach (var c in parms.Tabs[i].cells) + { + if (c != null) + cells.Add(c); + } + } + parms.LineRects.AddRange(cells.OrderBy(c => c.Y1).ThenBy(c => c.X0)); + } + parms.WrittenTables.Add(i); + prevHdrString = null; + } + + // Pick up images/graphics ABOVE this text block + for (int i = 0; i < parms.ImgRects.Count; i++) + { + if (parms.WrittenImages.Contains(i)) + continue; + + Rect r = parms.ImgRects[i]; + if (Math.Max(r.Y0, lrect.Y0) < Math.Min(r.Y1, lrect.Y1) && + (lrect.X0 <= r.X0 && r.X0 < lrect.X1 || + lrect.X0 < r.X1 && r.X1 <= lrect.X1 || + r.X0 <= lrect.X0 && lrect.X1 <= r.X1)) + { + string pathname = SaveImage(parms.Page, r, i, false, false, "", "", "", 150, 0.05f); + if (!string.IsNullOrEmpty(pathname)) + { + outString.AppendFormat(GRAPHICS_TEXT, pathname); + } + + if (forceText) + { + string imgTxt = WriteText(parms, r, getHeaderId, forceText: true, + ignoreCode: false, extractWords: false); + if (!Utils.IsWhite(imgTxt)) + { + outString.Append(imgTxt); + } + } + parms.WrittenImages.Add(i); + prevHdrString = null; + } + } + + parms.LineRects.Add(lrect); + + // If line rect is far away from previous one, add line break + if (parms.LineRects.Count > 1) + { + var prevRect = parms.LineRects[parms.LineRects.Count - 2]; + if (lrect.Y1 - prevRect.Y1 > lrect.Height * 1.5f) + { + outString.Append("\n"); + } + } + + // Make text string for the full line + string text = string.Join(" ", spans.Select(s => s.Text ?? 
"").Where(t => !string.IsNullOrWhiteSpace(t))).Trim(); + + // Check formatting flags + bool allStrikeout = spans.All(s => ((int)s.CharFlags & 1) != 0); + bool allItalic = spans.All(s => ((int)s.Flags & 2) != 0); + bool allBold = spans.All(s => (((int)s.Flags & 16) != 0) || (((int)s.CharFlags & 8) != 0)); + bool allMono = spans.All(s => ((int)s.Flags & 8) != 0); + + // Get header string + string hdrString = MaxHeaderId(spans, parms.Page, getHeaderId); + + if (!string.IsNullOrEmpty(hdrString)) + { + // Header line + if (allMono) + text = "`" + text + "`"; + if (allItalic) + text = "_" + text + "_"; + if (allBold) + text = "**" + text + "**"; + if (allStrikeout) + text = "~~" + text + "~~"; + + if (hdrString != prevHdrString) + { + outString.Append(hdrString + text + "\n"); + } + else + { + // Header text broken across multiple lines + while (outString.Length > 0 && outString[outString.Length - 1] == '\n') + outString.Length--; + outString.Append(" " + text + "\n"); + } + prevHdrString = hdrString; + continue; + } + + prevHdrString = hdrString; + + // Start or extend code block + if (allMono && !ignoreCode) + { + if (!code) + { + outString.Append("```\n"); + code = true; + } + float delta = (lrect.X0 - clip.X0) / (spans[0].Size * 0.5f); + string indent = new string(' ', Math.Max(0, (int)delta)); + outString.Append(indent + text + "\n"); + continue; + } + + if (code && !allMono) + { + outString.Append("```\n"); + code = false; + } + + ExtendedSpan span0 = spans[0]; + int bno = span0.Block; + if (bno != prevBno) + { + outString.Append("\n"); + prevBno = bno; + } + + // Check if we need another line break + if ((prevLrect != null && lrect.Y1 - prevLrect.Y1 > lrect.Height * 1.5f) || + (span0.Text != null && (span0.Text.StartsWith("[") || Utils.StartswithBullet(span0.Text))) || + ((int)span0.Flags & 1) != 0) // Superscript + { + outString.Append("\n"); + } + prevLrect = lrect; + + // Switch off code mode if not all mono + if (code) + { + outString.Append("```\n"); + code = false; + } + + // Process each span + foreach (var s in spans) + { + bool mono = ((int)s.Flags & 8) != 0; + bool bold = ((int)s.Flags & 16) != 0 || ((int)s.CharFlags & 8) != 0; + bool italic = ((int)s.Flags & 2) != 0; + bool strikeout = ((int)s.CharFlags & 1) != 0; + + string prefix = ""; + string suffix = ""; + + if (mono) + { + prefix = "`" + prefix; + suffix += "`"; + } + if (bold) + { + prefix = "**" + prefix; + suffix += "**"; + } + if (italic) + { + prefix = "_" + prefix; + suffix += "_"; + } + if (strikeout) + { + prefix = "~~" + prefix; + suffix += "~~"; + } + + // Convert intersecting link to markdown syntax + string ltext = ResolveLinks(parms.Links, s); + if (ltext != null) + { + text = hdrString + prefix + ltext + suffix + " "; + } + else + { + text = hdrString + prefix + (s.Text ?? "").Trim() + suffix + " "; + } + + if (Utils.StartswithBullet(text)) + { + text = "- " + text.Substring(1); + text = text.Replace(" ", " "); + float dist = span0.Bbox.X0 - clip.X0; + float cwidth = (span0.Bbox.X1 - span0.Bbox.X0) / Math.Max(1, (span0.Text ?? 
"").Length); + if (cwidth == 0.0f) + cwidth = span0.Size * 0.5f; + int indentCount = (int)Math.Round(dist / cwidth); + text = new string(' ', Math.Max(0, indentCount)) + text; + } + + outString.Append(text); + } + + if (!code) + outString.Append("\n"); + } + + outString.Append("\n"); + if (code) + { + outString.Append("```\n"); + code = false; + } + outString.Append("\n\n"); + + string result = outString.ToString(); + result = result.Replace(" \n", "\n").Replace(" ", " "); + while (result.Contains("\n\n\n")) + result = result.Replace("\n\n\n", "\n\n"); + + return result; + } + + private static Parameters GetPageOutput( + Document doc, + int pno, + List margins, + Func getHeaderId, + bool writeImages, + bool embedImages, + bool ignoreImages, + string imagePath, + string imageFormat, + string filename, + bool forceText, + int dpi, + bool ignoreCode, + bool ignoreGraphics, + string tableStrategy, + bool detectBgColor, + int? graphicsLimit, + bool ignoreAlpha, + bool extractWords, + bool pageSeparators, + float imageSizeLimit, + int textFlags) + { + Page page = doc[pno]; + // Remove rotation to ensure we work on rotation=0 + page.RemoveRotation(); + + // Create Parameters object to store page information + Parameters parms = new Parameters + { + Page = page, + Filename = filename, + MdString = "", + Images = new List(), + Tables = new List(), + Graphics = new List(), + Words = new List(), + LineRects = new List(), + AcceptInvisible = PageIsOcr(page) || ignoreAlpha + }; + + // Determine background color + if (detectBgColor) + { + parms.BgColor = Utils.GetBgColor(page); + } + + // Process margins + float left = 0, top = 0, right = 0, bottom = 0; + if (margins != null && margins.Count > 0) + { + if (margins.Count == 1) + { + left = top = right = bottom = margins[0]; + } + else if (margins.Count == 2) + { + top = bottom = margins[0]; + left = right = margins[1]; + } + else if (margins.Count == 4) + { + left = margins[0]; + top = margins[1]; + right = margins[2]; + bottom = margins[3]; + } + } + + // Set clip with margins: page.rect + (left, top, -right, -bottom) + parms.Clip = new Rect(page.Rect); + parms.Clip.X0 += left; + parms.Clip.Y0 += top; + parms.Clip.X1 -= right; + parms.Clip.Y1 -= bottom; + + // Extract external links on page + parms.Links = page.GetLinks() + .Where(l => l.Kind == LinkType.LINK_URI && !string.IsNullOrEmpty(l.Uri)) + .ToList(); + + // Extract annotation rectangles on page + try + { + var annots = page.GetAnnots(); + parms.AnnotRects = annots + .Where(a => a.Rect != null) + .Select(a => a.Rect) + .ToList(); + } + catch + { + parms.AnnotRects = new List(); + } + + // Make a TextPage for all later extractions (textFlags passed from ToMarkdown) + parms.TextPage = page.GetTextPage(flags: textFlags, clip: parms.Clip); + + // Extract and process tables if not ignoring graphics + List
<Table> tables = new List<Table>
(); + Dictionary tabRects = new Dictionary(); + List writtenTables = new List(); + + if (!ignoreGraphics && !string.IsNullOrEmpty(tableStrategy)) + { + try + { + var foundTables = page.GetTables(clip: page.Rect, strategy: tableStrategy); + for (int i = 0; i < foundTables.Count; i++) + { + var t = foundTables[i]; + // Remove tables with too few rows or columns + if (t.row_count < 2 || t.col_count < 2) + continue; + tables.Add(t); + // Combine table bbox with header bbox + Rect tabRect = t.bbox; + if (t.header != null && t.header.bbox != null) + { + Rect headerRect = t.header.bbox; + tabRect = Utils.JoinRects(new List { tabRect, headerRect }); + } + tabRects[tables.Count - 1] = tabRect; + } + // Sort tables by position (top to bottom, left to right) + var sortedIndices = Enumerable.Range(0, tables.Count) + .OrderBy(i => tabRects[i].Y0) + .ThenBy(i => tabRects[i].X0) + .ToList(); + var sortedTables = sortedIndices.Select(i => tables[i]).ToList(); + var sortedRects = sortedIndices.ToDictionary( + idx => sortedIndices.IndexOf(idx), + idx => tabRects[idx] + ); + tables = sortedTables; + tabRects = sortedRects; + } + catch + { + // If table extraction fails, continue without tables + } + } + + + // Extract and process images if not ignored + List imgRects = new List(); + if (!ignoreImages) + { + try + { + List imgInfo = page.GetImageInfo(); + + // Filter and process images + var validImages = imgInfo + .Where(img => img.Bbox != null) + .Select(img => new { Bbox = new Rect(img.Bbox), Block = img }) + .Where(img => + img.Bbox.Width >= imageSizeLimit * page.Rect.Width && + img.Bbox.Height >= imageSizeLimit * page.Rect.Height && + img.Bbox.Intersects(page.Rect) && + img.Bbox.Width > 3 && + img.Bbox.Height > 3) + .OrderByDescending(img => Math.Abs(img.Bbox.Width * img.Bbox.Height)) + .Take(30) // Limit to 30 largest images + .ToList(); + + // Remove images contained in larger images + for (int i = validImages.Count - 1; i > 0; i--) + { + Rect r = validImages[i].Bbox; + if (r.IsEmpty) + { + validImages.RemoveAt(i); + continue; + } + for (int j = 0; j < i; j++) + { + if (Utils.BboxInBbox(r, validImages[j].Bbox)) + { + validImages.RemoveAt(i); + break; + } + } + } + + parms.ImgRects = validImages.Select(img => img.Bbox).ToList(); + parms.Images = validImages.Select(img => (object)img.Block).ToList(); + } + catch + { + // If image extraction fails, continue without images + } + } + else + { + parms.ImgRects = new List(); + } + + // Store tables in parms + parms.Tabs = tables; + parms.TabRects = tabRects; + parms.WrittenTables = writtenTables; + parms.TabRects0 = tabRects.Values.ToList(); + + // Check graphics limit and set too_many_graphics flag + bool tooManyGraphics = false; + int graphicsCount = 0; + if (!ignoreGraphics && graphicsLimit.HasValue) + { + try + { + var bboxLog = page.GetBboxlog(); + graphicsCount = bboxLog.Count(b => b.Type != null && b.Type.Contains("path")); + if (graphicsCount > graphicsLimit.Value) + { + ignoreGraphics = true; + tooManyGraphics = true; + } + } + catch + { + // If bboxlog extraction fails, continue + } + } + + // Get paths for graphics and multi-column detection + List paths = new List(); + List vgClusters0 = new List(); + + if (!ignoreGraphics) + { + try + { + paths = page.GetDrawings() + .Where(p => p.Rect != null && + Utils.BboxInBbox(p.Rect, parms.Clip) && + p.Rect.Width < parms.Clip.Width && + p.Rect.Height < parms.Clip.Height && + (p.Rect.Width > 3 || p.Rect.Height > 3) && + !(p.Type == "f" && p.Fill != null && parms.BgColor != null && + p.Fill.Length >= 3 && 
parms.BgColor.Length >= 3 && + Math.Abs(p.Fill[0] - parms.BgColor[0]) < 0.01f && + Math.Abs(p.Fill[1] - parms.BgColor[1]) < 0.01f && + Math.Abs(p.Fill[2] - parms.BgColor[2]) < 0.01f) && + Utils.OutsideAllBboxes(p.Rect, parms.TabRects0) && + Utils.OutsideAllBboxes(p.Rect, parms.AnnotRects)) + .ToList(); + + // Cluster drawings + if (paths.Count > 0) + { + var clusters = page.ClusterDrawings(clip: parms.Clip, drawings: paths); + foreach (var bbox in clusters) + { + if (Utils.IsSignificant(bbox, paths)) + { + vgClusters0.Add(bbox); + } + } + + // Get paths that are in significant graphics + parms.ActualPaths = paths + .Where(p => Utils.BboxInAnyBbox(p.Rect, vgClusters0)) + .ToList(); + } + } + catch + { + paths = new List(); + } + } + + // Also add image rectangles to the list and vice versa + vgClusters0.AddRange(parms.ImgRects); + parms.ImgRects.AddRange(vgClusters0); + parms.ImgRects = parms.ImgRects + .Distinct() + .OrderBy(r => r.Y1) + .ThenBy(r => r.X0) + .ToList(); + parms.WrittenImages = new List(); + + // Refine graphics clusters + parms.VgClusters0 = Utils.RefineBoxes(vgClusters0); + parms.VgClusters = parms.VgClusters0 + .Select((r, i) => new { Index = i, Rect = r }) + .ToDictionary(x => x.Index, x => x.Rect); + + // Calculate character density for text rectangle determination + int blockCount = parms.TextPage.ExtractBlocks().Count; + float charDensity = blockCount > 0 + ? parms.TextPage.ExtractText().Length / (float)blockCount + : 0; + + // Use multi-column detection to get text rectangles + List textRects; + if (tooManyGraphics && charDensity < 20) + { + // This page has too many isolated text pieces for meaningful layout analysis + textRects = new List { parms.Clip }; + } + else + { + try + { + textRects = MultiColumn.ColumnBoxes( + page, + footerMargin: bottom, + headerMargin: top, + noImageText: !forceText, + textpage: parms.TextPage, + paths: parms.ActualPaths, + avoid: parms.TabRects0.Concat(parms.VgClusters0).ToList(), + ignoreImages: ignoreImages); + + // If no columns detected, use the full clip + if (textRects == null || textRects.Count == 0) + { + textRects = new List { parms.Clip }; + } + } + catch + { + // Fallback to full page if column detection fails + textRects = new List { parms.Clip }; + } + } + + // Process each text rectangle + StringBuilder mdOutput = new StringBuilder(); + foreach (Rect textRect in textRects) + { + // Output tables above this text rectangle + mdOutput.Append(OutputTables(parms, textRect, extractWords)); + + // Output images above this text rectangle + mdOutput.Append(OutputImages(parms, textRect, forceText, writeImages, embedImages, + imagePath, imageFormat, filename, dpi, imageSizeLimit, getHeaderId)); + + // Output text inside this rectangle + mdOutput.Append(WriteText(parms, textRect, getHeaderId, forceText, ignoreCode, extractWords)); + } + + // Write any remaining tables and images + mdOutput.Append(OutputTables(parms, null, extractWords)); + mdOutput.Append(OutputImages(parms, null, forceText, writeImages, embedImages, + imagePath, imageFormat, filename, dpi, imageSizeLimit, getHeaderId)); + + // Clean up the output + parms.MdString = mdOutput.ToString(); + parms.MdString = parms.MdString.Replace(" ,", ",").Replace("-\n", ""); + + while (parms.MdString.StartsWith("\n")) + { + parms.MdString = parms.MdString.Substring(1); + } + + parms.MdString = parms.MdString.Replace('\0', Utils.REPLACEMENT_CHARACTER); + + // Handle extract_words mode + if (extractWords) + { + var rawWords = parms.TextPage.ExtractWords(); + rawWords = 
rawWords.OrderBy(w => w.Y1).ThenBy(w => w.X0).ToList(); + + List words = new List(); + foreach (var lrect in parms.LineRects) + { + var lwords = rawWords + .Where(w => + { + var wrect = new Rect(w.X0, w.Y0, w.X1, w.Y1); + return Utils.BboxInBbox(wrect, lrect); + }) + .ToList(); + words.AddRange(SortWords(lwords)); + } + + // Remove duplicates + List nwords = new List(); + foreach (var w in words) + { + if (!nwords.Any(nw => nw.X0 == w.X0 && nw.Y0 == w.Y0 && nw.X1 == w.X1 && nw.Y1 == w.Y1 && nw.Text == w.Text)) + { + nwords.Add(w); + } + } + parms.Words = nwords.Cast().ToList(); + } + else + { + parms.Words = new List(); + } + + // Add page separators + if (pageSeparators) + { + parms.MdString += $"\n\n--- end of page={page.Number} ---\n\n"; + } + + return parms; + } + } +} diff --git a/MuPDF.NET4LLM/helpers/MultiColumn.cs b/MuPDF.NET4LLM/helpers/MultiColumn.cs new file mode 100644 index 0000000..ac7cfbb --- /dev/null +++ b/MuPDF.NET4LLM/helpers/MultiColumn.cs @@ -0,0 +1,421 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using MuPDF.NET; + +namespace MuPDF.NET4LLM.Helpers +{ + /// + /// Multi-column page detection utilities. + /// Ported and adapted from the Python module helpers/multi_column.py in pymupdf4llm. + /// + public static class MultiColumn + { + /// + /// Determine bboxes which wrap a column on the page + /// + public static List ColumnBoxes( + Page page, + float footerMargin = 50, + float headerMargin = 50, + bool noImageText = true, + TextPage textpage = null, + List paths = null, + List avoid = null, + bool ignoreImages = false) + { + // Compute relevant page area + Rect clip = new Rect(page.Rect); + clip.Y1 -= footerMargin; // Remove footer area + clip.Y0 += headerMargin; // Remove header area + + if (paths == null) + { + paths = page.GetDrawings() + .Where(p => p.Rect.Width < clip.Width && p.Rect.Height < clip.Height) + .ToList(); + } + + if (textpage == null) + { + textpage = page.GetTextPage(clip: clip, flags: (int)TextFlags.TEXT_ACCURATE_BBOXES); + } + + List bboxes = new List(); + List imgBboxes = new List(); + if (avoid != null) + imgBboxes.AddRange(avoid); + + List vertBboxes = new List(); + List pathRects = new List(); + + // Path rectangles + foreach (var p in paths) + { + // Give empty path rectangles some small width or height + Rect prect = new Rect(p.Rect); + float lwidth = p.Width > 0 ? 
p.Width * 0.5f : 0.5f; + + if (prect.Width == 0) + { + prect.X0 -= lwidth; + prect.X1 += lwidth; + } + if (prect.Height == 0) + { + prect.Y0 -= lwidth; + prect.Y1 += lwidth; + } + pathRects.Add(prect); + } + + // Sort path bboxes by ascending top, then left coordinates + pathRects = pathRects.OrderBy(b => (b.Y0, b.X0)).ToList(); + + // Bboxes of images on page, no need to sort them + if (!ignoreImages) + { + var images = page.GetImages(); + foreach (var item in images) + { + var boxes = page.GetImageRects(item.Xref); + var rects = boxes.Select(b => b.Rect).ToList(); + imgBboxes.AddRange(rects); + } + } + + // Blocks of text on page + PageInfo pageInfo = textpage.ExtractDict(null, false); + List blocks = pageInfo.Blocks; + + // Make block rectangles, ignoring non-horizontal text + foreach (var b in blocks) + { + Rect bbox = new Rect(b.Bbox); // Bbox of the block + + // Ignore text written upon images + if (noImageText && Utils.BboxInBbox(bbox, Utils.JoinRects(imgBboxes))) + continue; + + // Confirm first line to be horizontal + if (b.Lines == null || b.Lines.Count == 0) + continue; + + Line line0 = b.Lines[0]; // Get first line + if (line0.Dir == null || Math.Abs(1 - line0.Dir.X) > 1e-3) // Only (almost) horizontal text + { + vertBboxes.Add(bbox); // A block with non-horizontal text + continue; + } + + Rect srect = new Rect(); + foreach (var line in b.Lines) + { + Rect lbbox = new Rect(line.Bbox); + string text = string.Join("", line.Spans?.Select(s => s.Text) ?? new string[0]); + if (!Utils.IsWhite(text)) + { + srect = Utils.JoinRects(new List { srect, lbbox }); + } + } + bbox = srect; + + if (!Utils.BboxIsEmpty(bbox)) + bboxes.Add(bbox); + } + + // Sort text bboxes by ascending background, top, then left coordinates + bboxes = bboxes.OrderBy(k => (InBbox(k, pathRects), k.Y0, k.X0)).ToList(); + + // Immediately return if no text found + if (bboxes.Count == 0) + return new List(); + + // -------------------------------------------------------------------- + // Join bboxes to establish some column structure + // -------------------------------------------------------------------- + // The final block bboxes on page + List nblocks = new List { bboxes[0] }; // Pre-fill with first bbox + bboxes = bboxes.Skip(1).ToList(); // Remaining old bboxes + Dictionary cache = new Dictionary(); + + for (int i = 0; i < bboxes.Count; i++) // Iterate old bboxes + { + Rect bb = bboxes[i]; + bool check = false; // Indicates unwanted joins + + // Check if bb can extend one of the new blocks + for (int j = 0; j < nblocks.Count; j++) + { + Rect nbb = nblocks[j]; // A new block + + // Never join across columns + if (nbb.X1 < bb.X0 || bb.X1 < nbb.X0) + continue; + + // Never join across different background colors + if (InBboxUsingCache(nbb, pathRects, cache) != InBboxUsingCache(bb, pathRects, cache)) + continue; + + Rect temp = Utils.JoinRects(new List { bb, nbb }); // Temporary extension of new block + check = CanExtend(temp, nbb, nblocks, vertBboxes); + if (check is true) + { + break; + } + } + + if (!check) // Bb cannot be used to extend any of the new bboxes + { + nblocks.Add(bb); // So add it to the list + int j = nblocks.Count - 1; // Index of it + Rect temp = nblocks[j]; // New bbox added + + // Check if some remaining bbox is contained in temp + check = CanExtend(temp, bb, bboxes, vertBboxes); + if (check is false) + { + nblocks.Add(bb); + } + else + { + nblocks[j] = temp; + } + bboxes[i] = null; + } + } + + // Do some elementary cleaning + nblocks = CleanNblocks(nblocks); + if (nblocks.Count == 0) + return 
nblocks; + + // Several phases of rectangle joining + // TODO: disabled for now as too aggressive: + // nblocks = JoinRectsPhase1(nblocks); + nblocks = JoinRectsPhase2(nblocks); + nblocks = JoinRectsPhase3(nblocks, pathRects, cache); + + // Return identified text bboxes + + if (textpage != null && textpage != page.GetTextPage()) + textpage.Dispose(); + + return nblocks; + } + + private static int InBbox(Rect bb, List bboxes) + { + for (int i = 0; i < bboxes.Count; i++) + { + if (Utils.BboxInBbox(bb, bboxes[i])) + return i + 1; + } + return 0; + } + + private static int InBboxUsingCache(Rect bb, List bboxes, Dictionary cache) + { + string cacheKey = $"{bb.GetHashCode()}_{bboxes.GetHashCode()}"; + if (cache.TryGetValue(cacheKey, out int cached)) + return cached; + + int index = InBbox(bb, bboxes); + cache[cacheKey] = index; + return index; + } + + private static bool IntersectsBboxes(Rect bb, List bboxes) + { + return bboxes.Any(bbox => !Utils.OutsideBbox(bb, bbox, strict: true)); + } + + private static bool CanExtend(Rect temp, Rect bb, List bboxlist, List vertBboxes) + { + foreach (var b in bboxlist) + { + if (!IntersectsBboxes(temp, vertBboxes) && + (b == null || b == bb || Utils.BboxIsEmpty(Utils.IntersectRects(temp, b)))) + continue; + return false; + } + return true; + } + + private static List CleanNblocks(List nblocks) + { + // 1. Remove any duplicate blocks. + if (nblocks.Count < 2) + return nblocks; + + for (int i = nblocks.Count - 1; i > 0; i--) + { + if (nblocks[i].EqualTo(nblocks[i - 1])) + nblocks.RemoveAt(i); + } + + if (nblocks.Count == 0) + return nblocks; + + // 2. Repair sequence in special cases: + // Consecutive bboxes with almost same bottom value are sorted ascending + // by x-coordinate. + float y1 = nblocks[0].Y1; // First bottom coordinate + int i0 = 0; // Its index + int i1 = 0; // Index of last bbox with same bottom + + // Iterate over bboxes, identifying segments with approx. same bottom value. + // Replace every segment by its sorted version. + + for (int i = 1; i < nblocks.Count; i++) + { + Rect b1 = nblocks[i]; + if (Math.Abs(b1.Y1 - y1) > 3) // Different bottom + { + if (i1 > i0) // Segment length > 1? Sort it! + { + var segment = nblocks.Skip(i0).Take(i1 - i0 + 1).OrderBy(b => b.X0).ToList(); + for (int j = 0; j < segment.Count; j++) + nblocks[i0 + j] = segment[j]; + } + y1 = b1.Y1; // Store new bottom value + i0 = i; // Store its start index + } + i1 = i; // Store current index + } + if (i1 > i0) // Segment waiting to be sorted + { + var segment = nblocks.Skip(i0).Take(i1 - i0 + 1).OrderBy(b => b.X0).ToList(); + for (int j = 0; j < segment.Count; j++) + nblocks[i0 + j] = segment[j]; + } + + return nblocks; + } + + private static List JoinRectsPhase2(List bboxes) + { + // Postprocess identified text blocks, phase 2. + // Increase the width of each text block so that small left or right + // border differences are removed. Then try to join even more text + // rectangles. 
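+ // Tolerances used below: left/right borders within 3 points are treated as equal,
+ // and blocks no more than 10 points apart vertically are merged into one rectangle.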
+ List prects = bboxes.Select(b => new Rect(b)).ToList(); // Copy of argument list + + for (int i = 0; i < prects.Count; i++) + { + Rect b = prects[i]; + // Go left and right somewhat + float x0 = prects.Where(bb => Math.Abs(bb.X0 - b.X0) <= 3).Min(bb => bb.X0); + float x1 = prects.Where(bb => Math.Abs(bb.X1 - b.X1) <= 3).Max(bb => bb.X1); + b.X0 = x0; // Store new left / right border + b.X1 = x1; + prects[i] = b; + } + + // Sort by left, top + prects = prects.OrderBy(b => (b.X0, b.Y0)).ToList(); + List newRects = new List { prects[0] }; // Initialize with first item + + // Walk through the rest, top to bottom, then left to right + for (int i = 1; i < prects.Count; i++) + { + Rect r = prects[i]; + Rect r0 = newRects[newRects.Count - 1]; // Previous bbox + + // Join if we have similar borders and are not too far down + if (Math.Abs(r.X0 - r0.X0) <= 3 && + Math.Abs(r.X1 - r0.X1) <= 3 && + Math.Abs(r0.Y1 - r.Y0) <= 10) + { + r0 = Utils.JoinRects(new List { r0, r }); + newRects[newRects.Count - 1] = r0; + continue; + } + // Else append this as new text block + newRects.Add(r); + } + return newRects; + } + + private static List JoinRectsPhase3(List bboxes, List pathRects, Dictionary cache) + { + List prects = bboxes.Select(b => new Rect(b)).ToList(); + List newRects = new List(); + + while (prects.Count > 0) + { + Rect prect0 = prects[0]; + bool repeat = true; + while (repeat) + { + repeat = false; + for (int i = prects.Count - 1; i > 0; i--) + { + Rect prect1 = prects[i]; + // Do not join across columns + if (prect1.X0 > prect0.X1 || prect1.X1 < prect0.X0) + continue; + + // Do not join different backgrounds + if (InBboxUsingCache(prect0, pathRects, cache) != InBboxUsingCache(prect1, pathRects, cache)) + continue; + + Rect temp = Utils.JoinRects(new List { prect0, prect1 }); + var intersecting = prects.Concat(newRects).Where(b => b.Intersects(temp)).ToList(); + if (intersecting.Count == 2 && intersecting.Contains(prect0) && intersecting.Contains(prect1)) + { + prect0 = temp; + prects[0] = prect0; + prects.RemoveAt(i); + repeat = true; + } + } + } + newRects.Add(prect0); + prects.RemoveAt(0); + } + + // Hopefully the most reasonable sorting sequence: + // At this point we have finished identifying blocks that wrap text. + // We now need to determine the SEQUENCE by which text extraction from + // these blocks should take place. This is hardly possible with 100% + // certainty. Our sorting approach is guided by the following thought: + // 1. Extraction should start with the block whose top-left corner is the + // left-most and top-most. + // 2. Any blocks further to the right should be extracted later - even if + // their top-left corner is higher up on the page. + // 3. Sorting the identified rectangles must therefore happen using a + // tuple (y, x) as key, where y is not smaller (= higher up) than that + // of the left-most block with a non-empty vertical overlap. + // 4. To continue "left block" with "next is ...", its sort key must be + // tuple (P.y, Q.x). + var sortRects = newRects.Select(box => + { + // Search for the left-most rect that overlaps like "P" above + // Candidates must have the same background + int background = InBbox(box, pathRects); // This background + var leftRects = newRects + .Where(r => r.X1 < box.X0 && + (box.Y0 <= r.Y0 && r.Y0 <= box.Y1 || box.Y0 <= r.Y1 && r.Y1 <= box.Y1)) + .OrderBy(r => r.X1) + .ToList(); + + (float y, float x) key; + if (leftRects.Count > 0) // If a "P" rectangle was found ... 
+                {
+                    key = (leftRects[leftRects.Count - 1].Y0, box.X0);  // Use this key
+                }
+                else
+                {
+                    key = (box.Y0, box.X0);  // Else use the original (Q.y, Q.x).
+                }
+                return (box, key);
+            })
+            .OrderBy(sr => sr.key)    // By computed key
+            .Select(sr => sr.box)     // Extract sorted rectangles
+            .ToList();
+
+            return sortRects;
+        }
+    }
+}
diff --git a/MuPDF.NET4LLM/helpers/Progress.cs b/MuPDF.NET4LLM/helpers/Progress.cs
new file mode 100644
index 0000000..18caf67
--- /dev/null
+++ b/MuPDF.NET4LLM/helpers/Progress.cs
@@ -0,0 +1,91 @@
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace MuPDF.NET4LLM.Helpers
+{
+    /// <summary>
+    /// Text-based progress bar to allow watching the advancement
+    /// of Markdown conversion of document pages.
+    /// Ported and adapted from the Python helpers/progress.py in pymupdf4llm.
+    ///
+    /// Copyright and License
+    /// Copyright 2024 Artifex Software, Inc.
+    /// License GNU Affero GPL 3.0
+    /// </summary>
+    public class _ProgressBar : IEnumerator<object>
+    {
+        private readonly List<object> _items;
+        private readonly int _progressWidth;
+        private readonly int _lenDigits;
+        private float _progressBarValue;
+        private int _currentIndex;
+        private IEnumerator<object> _enumerator;
+
+        public _ProgressBar(List<object> items, int progressWidth = 40)
+        {
+            _items = items;
+            _progressWidth = progressWidth;
+            _lenDigits = items.Count.ToString().Length;
+            _progressBarValue = 0;
+            _currentIndex = -1;  // Start at -1 for initial MoveNext to work
+            _enumerator = items.GetEnumerator();
+
+            // Calculate the increment for each item based on the list length and the progress width
+            // Init progress bar
+            Console.Write($"[{new string(' ', _progressWidth)}] (0/{_items.Count})");
+            Console.Out.Flush();
+            // Move the cursor back to the start of the bar (one backspace per written character)
+            Console.Write(new string('\b', _progressWidth + _lenDigits + 6));
+        }
+
+        public object Current => _enumerator.Current;
+
+        public bool MoveNext()
+        {
+            if (!_enumerator.MoveNext())
+            {
+                // End progress on StopIteration
+                Console.WriteLine("]\n");
+                return false;
+            }
+
+            // Update the current index
+            _currentIndex++;
+
+            // Add the increment to the progress bar and calculate how many "=" to add
+            _progressBarValue += (float)_progressWidth / _items.Count;
+
+            int filledLength = (int)(_currentIndex * (float)_progressWidth / _items.Count);
+            // Update the numerical progress
+            string paddedIndex = (_currentIndex + 1).ToString().PadLeft(_lenDigits);
+            string progressInfo = $" ({paddedIndex}/{_items.Count})";
+
+            Console.Write($"\r[{new string('=', filledLength)}{new string(' ', _progressWidth - filledLength)}]");
+            Console.Write(progressInfo);
+            Console.Out.Flush();
+
+            return true;
+        }
+
+        public void Reset()
+        {
+            _currentIndex = -1;
+            _progressBarValue = 0;
+            _enumerator.Reset();
+        }
+
+        public void Dispose()
+        {
+            _enumerator?.Dispose();
+        }
+    }
+
+    public static class ProgressBar
+    {
+        public static IEnumerator<object> Create(List<object> list, int progressWidth = 40)
+        {
+            return new _ProgressBar(list, progressWidth);
+        }
+    }
+}
diff --git a/MuPDF.NET4LLM/helpers/Utils.cs b/MuPDF.NET4LLM/helpers/Utils.cs
new file mode 100644
index 0000000..385c55e
--- /dev/null
+++ b/MuPDF.NET4LLM/helpers/Utils.cs
@@ -0,0 +1,669 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using MuPDF.NET;
+using mupdf;
+
+namespace MuPDF.NET4LLM.Helpers
+{
+    ///
+    /// Utility functions for PDF processing and layout analysis.
+    /// Ported and adapted from the Python module helpers/utils.py in pymupdf4llm.
+ /// + public static class Utils + { + // Constants + public static readonly HashSet WHITE_CHARS = new HashSet( + Enumerable.Range(0, 33).Select(i => (char)i) + .Concat(new[] + { + '\u00a0', // Non-breaking space + '\u2000', // En quad + '\u2001', // Em quad + '\u2002', // En space + '\u2003', // Em space + '\u2004', // Three-per-em space + '\u2005', // Four-per-em space + '\u2006', // Six-per-em space + '\u2007', // Figure space + '\u2008', // Punctuation space + '\u2009', // Thin space + '\u200a', // Hair space + '\u202f', // Narrow no-break space + '\u205f', // Medium mathematical space + '\u3000', // Ideographic space + }) + ); + + public const char REPLACEMENT_CHARACTER = '\uFFFD'; + public const string TYPE3_FONT_NAME = "Unnamed-T3"; + + public static readonly HashSet BULLETS = new HashSet( + new[] + { + '\u002A', // * + '\u002D', // - + '\u003E', // > + '\u006F', // o + '\u00B6', // ¶ + '\u00B7', // · + '\u2010', // ‐ + '\u2011', // ‑ + '\u2012', // ‒ + '\u2013', // – + '\u2014', // — + '\u2015', // ― + '\u2020', // † + '\u2021', // ‡ + '\u2022', // • + '\u2212', // − + '\u2219', // ∙ + '\uF0A7', // Private use + '\uF0B7', // Private use + REPLACEMENT_CHARACTER, + } + .Concat(Enumerable.Range(0x25A0, 0x2600 - 0x25A0).Select(i => (char)i)) + ); + + public static int FLAGS = (int)( + mupdf.mupdf.FZ_STEXT_COLLECT_STYLES | + mupdf.mupdf.FZ_STEXT_COLLECT_VECTORS | + (int)TextFlags.TEXT_PRESERVE_IMAGES | + (int)TextFlags.TEXT_ACCURATE_BBOXES | + (int)TextFlags.TEXT_MEDIABOX_CLIP + ); + + /// + /// Traverse /AcroForm/Fields hierarchy and return a dict: + /// fully qualified field name -> {"value": ..., "pages": [...]} + /// Optionally, the xref of the field is included. + /// + public static Dictionary> ExtractFormFieldsWithPages(Document doc, bool xrefs = false) + { + // Access the AcroForm dictionary. + // Fast exit if not present or empty. + // Placeholder - would need to access PDF internals + return new Dictionary>(); + } + + /// + /// Normalize a folder path ("" = script folder), ensure it exists, + /// and return a Markdown-safe file reference using forward slashes. + /// Prefers relative paths to avoid Windows drive-letter issues. + /// + public static (string mdRef, string actualPath) MdPath(string folder, string filename) + { + // 1. Use current working directory as script dir. + string scriptDir = Directory.GetCurrentDirectory(); + string basePath; + + if (string.IsNullOrWhiteSpace(folder)) + { + basePath = scriptDir; + } + else + { + basePath = Environment.ExpandEnvironmentVariables(folder); + basePath = Path.GetFullPath(basePath); + } + + // 2. Create folder if it doesn't exist + Directory.CreateDirectory(basePath); + + // 3. Build full file path + string fullPath = Path.Combine(basePath, Path.GetFileName(filename)); + string mdRef; + + // 4. Try to compute a relative path (best for Markdown) + // Calculate relative path manually for compatibility with .NET Standard 2.0 + // Path.GetRelativePath is only available in .NET Core 2.1+ and .NET Standard 2.1+ + if (fullPath.StartsWith(scriptDir, StringComparison.OrdinalIgnoreCase)) + { + string relative = fullPath.Substring(scriptDir.Length).TrimStart(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar); + mdRef = relative.Replace("\\", "/"); + if (!string.IsNullOrEmpty(mdRef) && !mdRef.StartsWith(".")) + mdRef = "./" + mdRef; + } + else + { + // Not relative → fall back to POSIX path + mdRef = fullPath.Replace("\\", "/"); + } + // 5. 
Escape Markdown-sensitive characters + // Escaping bracket is for MD references only, not for actual file saving. + // The first item is the MD-safe form, + // the second is the actual path to use in pixmap saving. + mdRef = mdRef.Replace("(", "-").Replace(")", "-") + .Replace("[", "-").Replace("]", "-"); + + return (mdRef, fullPath); + } + + /// + /// Check if text starts with a bullet character + /// + public static bool StartswithBullet(string text) + { + if (string.IsNullOrEmpty(text)) + return false; + if (!BULLETS.Contains(text[0])) + return false; + if (text.Length == 1) + return true; + if (text[1] == ' ') + return true; + return false; + } + + /// + /// Identify white text + /// + public static bool IsWhite(string text) + { + if (string.IsNullOrEmpty(text)) + return true; + return text.All(c => WHITE_CHARS.Contains(c)); + } + + /// + /// Check if bounding box is empty + /// + public static bool BboxIsEmpty(Rect bbox) + { + if (bbox == null) + return true; + return bbox.X0 >= bbox.X1 || bbox.Y0 >= bbox.Y1; + } + + /// + /// Intersect two rectangles + /// + public static Rect IntersectRects(Rect r1, Rect r2, bool bboxOnly = false) + { + if (r1 == null || r2 == null) + return new Rect(); + + float x0 = Math.Max(r1.X0, r2.X0); + float y0 = Math.Max(r1.Y0, r2.Y0); + float x1 = Math.Min(r1.X1, r2.X1); + float y1 = Math.Min(r1.Y1, r2.Y1); + + if (x0 >= x1 || y0 >= y1) + return new Rect(); + + return new Rect(x0, y0, x1, y1); + } + + /// + /// Join a list of rectangles into their bounding rectangle + /// + public static Rect JoinRects(List rects, bool bboxOnly = false) + { + if (rects == null || rects.Count == 0) + return new Rect(); + + float x0 = rects.Min(r => r.X0); + float y0 = rects.Min(r => r.Y0); + float x1 = rects.Max(r => r.X1); + float y1 = rects.Max(r => r.Y1); + + return new Rect(x0, y0, x1, y1); + } + + /// + /// Check if bbox is almost entirely within clip + /// + public static bool AlmostInBbox(Rect bbox, Rect clip, float portion = 0.8f) + { + if (bbox == null || clip == null) + return false; + + float x0 = Math.Max(bbox.X0, clip.X0); + float y0 = Math.Max(bbox.Y0, clip.Y0); + float x1 = Math.Min(bbox.X1, clip.X1); + float y1 = Math.Min(bbox.Y1, clip.Y1); + + float interArea = Math.Max(0, x1 - x0) * Math.Max(0, y1 - y0); + float boxArea = (bbox.X1 - bbox.X0) * (bbox.Y1 - bbox.Y0); + + // If intersection area is greater than portion of box area + return interArea > boxArea * portion; + } + + /// + /// Check if bbox is outside cell + /// + public static bool OutsideBbox(Rect bbox, Rect cell, bool strict = false) + { + if (bbox == null || cell == null) + return true; + + if (!strict) + { + return bbox.X0 >= cell.X1 || bbox.X1 <= cell.X0 || + bbox.Y0 >= cell.Y1 || bbox.Y1 <= cell.Y0; + } + else + { + return bbox.X0 > cell.X1 || bbox.X1 < cell.X0 || + bbox.Y0 > cell.Y1 || bbox.Y1 < cell.Y0; + } + } + + /// + /// Check if inner rectangle is contained within outer rectangle + /// + public static bool BboxInBbox(Rect inner, Rect outer) + { + if (inner == null || outer == null) + return false; + + return outer.X0 <= inner.X0 && outer.Y0 <= inner.Y0 && + outer.X1 >= inner.X1 && outer.Y1 >= inner.Y1; + } + + /// + /// Check if rect is contained in any rect of the list + /// + public static bool BboxInAnyBbox(Rect rect, IEnumerable rectList) + { + if (rect == null || rectList == null) + return false; + + return rectList.Any(r => BboxInBbox(rect, r)); + } + + /// + /// Check if rect is outside all rects in the list + /// + public static bool OutsideAllBboxes(Rect rect, IEnumerable 
rectList) + { + if (rect == null || rectList == null) + return true; + + return rectList.All(r => OutsideBbox(rect, r)); + } + + /// + /// Check if middle of rect is contained in any rect of the list + /// + public static bool AlmostInAnyBbox(Rect rect, IEnumerable rectList, float portion = 0.5f) + { + if (rect == null || rectList == null) + return false; + + // Enlarge rect slightly + Rect enlarged = new Rect( + rect.X0 - 1, + rect.Y0 - 1, + rect.X1 + 1, + rect.Y1 + 1 + ); + + return rectList.Any(r => AlmostInBbox(enlarged, r, portion)); + } + + /// + /// Join any rectangles with a pairwise non-empty overlap. + /// Accepts and returns a list of Rect items. + /// Note that rectangles that only "touch" each other (common point or edge) + /// are not considered as overlapping. + /// Use a positive "enlarge" parameter to enlarge rectangle by these many + /// points in every direction. + /// TODO: Consider using a sweeping line algorithm for this. + /// + public static List RefineBoxes(List boxes, float enlarge = 0) + { + if (boxes == null || boxes.Count == 0) + return new List(); + + List newRects = new List(); + // List of all vector graphic rectangles + List prects = boxes.Select(b => new Rect(b)).ToList(); + + while (prects.Count > 0) // The algorithm will empty this list + { + Rect r = new Rect(prects[0]); // Copy of first rectangle + r.X0 -= enlarge; + r.Y0 -= enlarge; + r.X1 += enlarge; + r.Y1 += enlarge; + + bool repeat = true; // Initialize condition + while (repeat) + { + repeat = false; // Set false as default + for (int i = prects.Count - 1; i > 0; i--) // From back to front + { + if (r.Intersects(prects[i])) // Enlarge first rect with this + { + r = Utils.JoinRects(new List { r, prects[i] }); + prects.RemoveAt(i); // Delete this rect + repeat = true; // Indicate must try again + } + } + } + + // First rect now includes all overlaps + newRects.Add(r); + prects.RemoveAt(0); + } + + return newRects + .OrderBy(r => r.X0) + .ThenBy(r => r.Y0) + .ToList(); // Sort by left, top + } + + /// + /// Determine the background color of the page + /// + public static float[] GetBgColor(Page page) + { + if (page == null) + return null; + + try + { + // Check upper left corner + Rect ulRect = new Rect(page.Rect.X0, page.Rect.Y0, page.Rect.X0 + 10, page.Rect.Y0 + 10); + Pixmap pixUL = page.GetPixmap(clip: ulRect); + if (pixUL == null || pixUL.SAMPLES == null || !pixUL.IsUniColor) + { + pixUL?.Dispose(); + return null; + } + var pixelUL = pixUL.GetPixel(0, 0); + pixUL.Dispose(); + + // Check upper right corner + Rect urRect = new Rect(page.Rect.X1 - 10, page.Rect.Y0, page.Rect.X1, page.Rect.Y0 + 10); + Pixmap pixUR = page.GetPixmap(clip: urRect); + if (pixUR == null || pixUR.SAMPLES == null || !pixUR.IsUniColor) + { + pixUR?.Dispose(); + return null; + } + var pixelUR = pixUR.GetPixel(0, 0); + pixUR.Dispose(); + + if (pixelUL.Length != pixelUR.Length || + !pixelUL.SequenceEqual(pixelUR)) + return null; + + // Check lower left corner + Rect llRect = new Rect(page.Rect.X0, page.Rect.Y1 - 10, page.Rect.X0 + 10, page.Rect.Y1); + Pixmap pixLL = page.GetPixmap(clip: llRect); + if (pixLL == null || pixLL.SAMPLES == null || !pixLL.IsUniColor) + { + pixLL?.Dispose(); + return null; + } + var pixelLL = pixLL.GetPixel(0, 0); + pixLL.Dispose(); + + if (pixelUL.Length != pixelLL.Length || + !pixelUL.SequenceEqual(pixelLL)) + return null; + + // Check lower right corner + Rect lrRect = new Rect(page.Rect.X1 - 10, page.Rect.Y1 - 10, page.Rect.X1, page.Rect.Y1); + Pixmap pixLR = page.GetPixmap(clip: lrRect); + if 
(pixLR == null || pixLR.SAMPLES == null || !pixLR.IsUniColor) + { + pixLR?.Dispose(); + return null; + } + var pixelLR = pixLR.GetPixel(0, 0); + pixLR.Dispose(); + + if (pixelUL.Length != pixelLR.Length || + !pixelUL.SequenceEqual(pixelLR)) + return null; + + // All corners match - return normalized RGB + if (pixelUL.Length >= 3) + { + return new float[] + { + pixelUL[0] / 255f, + pixelUL[1] / 255f, + pixelUL[2] / 255f + }; + } + } + catch + { + // If background detection fails, return null + } + + return null; + } + + /// + /// Check whether the rectangle contains significant drawings + /// + public static bool IsSignificant(Rect box, List paths) + { + if (box == null || paths == null || paths.Count == 0) + return false; + + // Build a sub-box of 90% of the original box + // To this end, we build a sub-box of 90% of the original box and check + // whether this still contains drawing paths. + float d; + if (box.Width > box.Height) + d = box.Width * 0.025f; + else + d = box.Height * 0.025f; + + Rect nbox = new Rect( + box.X0 + d, + box.Y0 + d, + box.X1 - d, + box.Y1 - d + ); // Nbox covers 90% of box interior + + // Paths contained in, but not equal to box + var myPaths = paths + .Where(p => p.Rect != null && + BboxInBbox(p.Rect, box) && + !p.Rect.EqualTo(box)) + .ToList(); + + if (myPaths.Count == 0) + return false; + + // Check if all paths are horizontal or vertical lines + var widths = myPaths.Select(p => (int)Math.Round(p.Rect.Width)) + .Concat(new[] { (int)Math.Round(box.Width) }) + .Distinct() + .ToList(); + var heights = myPaths.Select(p => (int)Math.Round(p.Rect.Height)) + .Concat(new[] { (int)Math.Round(box.Height) }) + .Distinct() + .ToList(); + + if (widths.Count == 1 || heights.Count == 1) + return false; // All paths are horizontal or vertical lines / rectangles + + // Check if any path intersects the interior + foreach (var p in myPaths) + { + Rect rect = p.Rect; + if (!( + BboxIsEmpty(rect) || BboxIsEmpty(IntersectRects(rect, nbox)) + )) // Intersects interior: significant! 
+ { + return true; + } + // Remaining case: a horizontal or vertical line + // Horizontal line: + if ( + true + && Math.Abs(rect.Y0 - rect.Y1) < 0.1f + && nbox.Y0 <= rect.Y0 && rect.Y0 <= nbox.Y1 + && rect.X0 < nbox.X1 + && rect.X1 > nbox.X0 + ) + { + return true; + } + // Vertical line + if ( + true + && Math.Abs(rect.X0 - rect.X1) < 0.1f + && nbox.X0 <= rect.X0 && rect.X0 <= nbox.X1 + && rect.Y0 < nbox.Y1 + && rect.Y1 > nbox.Y0 + ) + { + return true; + } + } + + return false; + } + + /// + /// Expand bbox to include all points + /// + public static (float x0, float y0, float x1, float y1) ExpandBboxByPoints( + (float x0, float y0, float x1, float y1) bbox, + List points) + { + if (points == null || points.Count == 0) + return bbox; + + float x0 = Math.Min(points.Min(p => p.X), bbox.x0); + float y0 = Math.Min(points.Min(p => p.Y), bbox.y0); + float x1 = Math.Max(points.Max(p => p.X), bbox.x1); + float y1 = Math.Max(points.Max(p => p.Y), bbox.y1); + + return (x0, y0, x1, y1); + } + + /// + /// Analyze the page for OCR decision + /// + public static Dictionary AnalyzePage(Page page, List blocks = null) + { + int charsTotal = 0; + int charsBad = 0; + + if (blocks == null) + { + TextPage textPage = page.GetTextPage( + clip: new Rect(float.NegativeInfinity, float.NegativeInfinity, + float.PositiveInfinity, float.PositiveInfinity), + flags: FLAGS); + PageInfo pageInfo = textPage.ExtractDict(null, false); + blocks = pageInfo.Blocks; + textPage.Dispose(); + } + + Rect imgRect = new Rect(); + Rect txtRect = new Rect(); + Rect vecRect = new Rect(); + float imgArea = 0; + float txtArea = 0; + float vecArea = 0; + int ocrSpans = 0; + + foreach (var b in blocks) + { + // Intersect each block bbox with the page rectangle. + // Note that this has no effect on text because of the clipping flags, + // which causes that we will not see ANY clipped text. + Rect bbox = IntersectRects(page.Rect, b.Bbox); + float area = bbox.Width * bbox.Height; + if (area == 0.0f) // Skip any empty block + continue; + + if (b.Type == 1) // Image block + { + imgRect = JoinRects(new List { imgRect, bbox }); + imgArea += area; + } + else if (b.Type == 0) // Text block + { + if (BboxIsEmpty(b.Bbox)) + continue; + + if (b.Lines != null) + { + foreach (var line in b.Lines) + { + if (BboxIsEmpty(line.Bbox)) + continue; + + if (line.Spans != null) + { + foreach (var span in line.Spans) + { + string text = span.Text ?? 
""; + if (IsWhite(text)) + continue; + + Rect sr = IntersectRects(page.Rect, span.Bbox); + if (BboxIsEmpty(sr)) + continue; + + // Check for OCR spans: font is "GlyphLessFont" or + // (char_flags & 8 == 0 and char_flags & 16 == 0) + // Note: CharFlags and Alpha may need to be accessed differently + // For now, check font name for OCR detection + if (span.Font == "GlyphLessFont") + { + ocrSpans++; + } + // Alpha check would need to be implemented based on available API + // Skip invisible text (alpha == 0) + + charsTotal += text.Trim().Length; + charsBad += text.Count(c => c == REPLACEMENT_CHARACTER); + txtRect = JoinRects(new List { txtRect, sr }); + txtArea += sr.Width * sr.Height; + } + } + } + } + } + else if ( + true + && b.Type == 3 // Vector block + // && b.Stroked // Note: Stroked and IsRect may not be available + && 2 < bbox.Width && bbox.Width <= 20 // Width limit for typical characters + && 2 < bbox.Height && bbox.Height <= 20 // Height limit for typical characters + // && !b.IsRect // Contains curves + ) + { + // Potential character-like vector block + vecRect = JoinRects(new List { vecRect, bbox }); + vecArea += area; + } + } + + // The rectangle on page covered by some content + Rect covered = JoinRects(new List { imgRect, txtRect, vecRect }); + float coverArea = Math.Abs(covered.Width * covered.Height); + + // The area-related float values are computed as fractions of the total covered area. + return new Dictionary + { + ["covered"] = covered, // Page area covered by content + ["img_joins"] = coverArea > 0 ? Math.Abs(imgRect.Width * imgRect.Height) / coverArea : 0, // Fraction of area of the joined images + ["img_area"] = coverArea > 0 ? imgArea / coverArea : 0, // Fraction of sum of image area sizes + ["txt_joins"] = coverArea > 0 ? Math.Abs(txtRect.Width * txtRect.Height) / coverArea : 0, // Fraction of area of the joined text spans + ["txt_area"] = coverArea > 0 ? txtArea / coverArea : 0, // Fraction of sum of text span bbox area sizes + ["vec_area"] = coverArea > 0 ? vecArea / coverArea : 0, // Fraction of sum of vector character area sizes + ["vec_joins"] = coverArea > 0 ? Math.Abs(vecRect.Width * vecRect.Height) / coverArea : 0, // Fraction of area of the joined vector characters + ["chars_total"] = charsTotal, // Count of visible characters + ["chars_bad"] = charsBad, // Count of Replacement Unicode characters + ["ocr_spans"] = ocrSpans, // Count: text spans with ignored text (render mode 3) + }; + } + } +} diff --git a/MuPDF.NET4LLM/llama/PDFMarkdownReader.cs b/MuPDF.NET4LLM/llama/PDFMarkdownReader.cs new file mode 100644 index 0000000..09fa80a --- /dev/null +++ b/MuPDF.NET4LLM/llama/PDFMarkdownReader.cs @@ -0,0 +1,152 @@ +using System; +using System.Collections.Generic; +using System.IO; +using MuPDF.NET; + +namespace MuPDF.NET4LLM.Llama +{ + /// + /// LlamaIndex-compatible PDF reader using MuPDF.NET4LLM. + /// Ported and adapted from the Python module llama/pdf_markdown_reader.py. + /// Note: This is a C# implementation that provides similar functionality + /// to the original Python `PDFMarkdownReader`. + /// + public class PDFMarkdownReader + { + public Func, Dictionary> MetaFilter { get; set; } + + public PDFMarkdownReader(Func, Dictionary> metaFilter = null) + { + MetaFilter = metaFilter; + } + + /// + /// Loads list of documents from PDF file and also accepts extra information in dict format. + /// + /// + /// Path-like object (string or Path-like) pointing to the PDF file. 
+        ///
+        ///
+        /// Optional base metadata dictionary that is copied and enriched per page
+        /// (file path, page number, total pages, document metadata).
+        ///
+        ///
+        /// Optional keyword arguments controlling rendering:
+        /// write_images, embed_images, image_path,
+        /// image_format, force_text, show_progress – these are
+        /// forwarded to Helpers.MuPdfRag.ToMarkdown.
+        ///
+        ///
+        /// A list of LlamaIndexDocument instances, one per page, whose
+        /// Text contains Markdown for that page and whose
+        /// ExtraInfo holds page-level metadata.
+        ///
+        public List<LlamaIndexDocument> LoadData(
+            object filePath,  // Can be Path or string
+            Dictionary<string, object> extraInfo = null,
+            Dictionary<string, object> loadKwargs = null)
+        {
+            if (filePath == null)
+                throw new ArgumentNullException(nameof(filePath));
+
+            string filePathStr = filePath is string str ? str : filePath.ToString();
+            if (!File.Exists(filePathStr))
+                throw new FileNotFoundException($"File not found: {filePathStr}");
+
+            if (extraInfo == null)
+                extraInfo = new Dictionary<string, object>();
+
+            if (loadKwargs == null)
+                loadKwargs = new Dictionary<string, object>();
+
+            // Extract text header information
+            var hdrInfo = new Helpers.IdentifyHeaders(filePathStr);
+
+            Document doc = new Document(filePathStr);
+            List<LlamaIndexDocument> docs = new List<LlamaIndexDocument>();
+
+            try
+            {
+                for (int i = 0; i < doc.PageCount; i++)
+                {
+                    docs.Add(ProcessDocPage(
+                        doc, extraInfo, filePathStr, i, hdrInfo, loadKwargs));
+                }
+            }
+            finally
+            {
+                doc.Close();
+            }
+
+            return docs;
+        }
+
+        private LlamaIndexDocument ProcessDocPage(
+            Document doc,
+            Dictionary<string, object> extraInfo,
+            string filePath,
+            int pageNumber,
+            object hdrInfo,
+            Dictionary<string, object> loadKwargs)
+        {
+            extraInfo = ProcessDocMeta(doc, filePath, pageNumber, extraInfo);
+
+            if (MetaFilter != null)
+                extraInfo = MetaFilter(extraInfo);
+
+            string text = Helpers.MuPdfRag.ToMarkdown(
+                doc,
+                pages: new List<int> { pageNumber },
+                hdrInfo: hdrInfo,
+                writeImages: loadKwargs.ContainsKey("write_images") && (bool)loadKwargs["write_images"],
+                embedImages: loadKwargs.ContainsKey("embed_images") && (bool)loadKwargs["embed_images"],
+                imagePath: loadKwargs.ContainsKey("image_path") ? (string)loadKwargs["image_path"] : "",
+                imageFormat: loadKwargs.ContainsKey("image_format") ? (string)loadKwargs["image_format"] : "png",
+                filename: filePath,
+                forceText: loadKwargs.ContainsKey("force_text") ? (bool)loadKwargs["force_text"] : true,
+                showProgress: loadKwargs.ContainsKey("show_progress") && (bool)loadKwargs["show_progress"]
+            );
+
+            return new LlamaIndexDocument
+            {
+                Text = text,
+                ExtraInfo = extraInfo
+            };
+        }
+
+        /// <summary>
+        /// Process metadata of a PDF document.
+        /// </summary>
+        private Dictionary<string, object> ProcessDocMeta(
+            Document doc,
+            string filePath,
+            int pageNumber,
+            Dictionary<string, object> extraInfo)
+        {
+            if (extraInfo == null)
+                extraInfo = new Dictionary<string, object>();
+
+            // Add document metadata
+            var metadata = doc.MetaData;
+            foreach (var kvp in metadata)
+            {
+                extraInfo[kvp.Key] = kvp.Value;
+            }
+
+            extraInfo["page"] = pageNumber + 1;
+            extraInfo["total_pages"] = doc.PageCount;
+            extraInfo["file_path"] = filePath;
+
+            return extraInfo;
+        }
+    }
+
+    /// <summary>
+    /// Document structure for LlamaIndex compatibility
+    /// </summary>
+    public class LlamaIndexDocument
+    {
+        public string Text { get; set; }
+        public Dictionary<string, object> ExtraInfo { get; set; }
+    }
+}
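
Note: the following is a minimal usage sketch added for illustration. It assumes the API introduced in this diff (PDFMarkdownReader.LoadData, LlamaIndexDocument, and the load_kwargs keys read in ProcessDocPage); the input file name is hypothetical.

    using System;
    using System.Collections.Generic;
    using MuPDF.NET4LLM.Llama;

    class ReaderDemo
    {
        static void Main()
        {
            var reader = new PDFMarkdownReader();
            var loadKwargs = new Dictionary<string, object>
            {
                ["write_images"] = false,   // keep the Markdown self-contained
                ["show_progress"] = true    // print the text progress bar while converting
            };

            // "input.pdf" is a placeholder path; one LlamaIndexDocument is returned per page.
            List<LlamaIndexDocument> docs = reader.LoadData("input.pdf", null, loadKwargs);
            foreach (var d in docs)
                Console.WriteLine($"--- page {d.ExtraInfo["page"]} ---\n{d.Text}");
        }
    }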