// hocr-to-textract.cs
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Xml.Linq;
namespace ImageOCR
{
/// <summary>
/// Converts hOCR XHTML into JSON laid out like an Amazon Textract response.
/// </summary>
/// <remarks>
/// Every element whose <c>class</c> contains <c>ocr_page</c> becomes a PAGE block, every
/// <c>ocr_line</c> below it a LINE block, and every non-empty <c>ocrx_word</c> below a line
/// a WORD block. Geometry is expressed as a fraction of the page's own <c>bbox</c>.
/// </remarks>
public class HocrToTextractConverter
{
    /// <summary>
    /// Page width/height assumed when an <c>ocr_page</c> has no usable <c>bbox</c>.
    /// </summary>
    private const float FallbackPageSize = 1000f;

    /// <summary>Characters that may separate a title property name from its value.</summary>
    private static readonly char[] TitleKeySeparators = { ' ', '\t' };

    /// <summary>Serializer settings shared by all conversions (indented, nulls omitted).</summary>
    private static readonly JsonSerializerOptions SerializerOptions = new JsonSerializerOptions
    {
        WriteIndented = true,
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull
    };

    /// <summary>
    /// Converts an hOCR XHTML string to a Textract-style JSON document.
    /// </summary>
    /// <param name="hocrXhtml">hOCR markup; must be well-formed XML.</param>
    /// <returns>Indented JSON containing <c>DocumentMetadata</c> and <c>Blocks</c>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="hocrXhtml"/> is null.</exception>
    /// <exception cref="System.Xml.XmlException">The markup cannot be parsed as XML.</exception>
    public string ConvertToTextract(string hocrXhtml)
    {
        if (hocrXhtml == null)
        {
            throw new ArgumentNullException(nameof(hocrXhtml));
        }

        var document = XDocument.Parse(hocrXhtml);
        var pages = document.Descendants()
            .Where(e => HasClass(e, "ocr_page"))
            .ToList();

        var blocks = new List<TextractBlock>();

        // LINE and WORD ids draw their numeric suffix from one shared counter.
        var nextBlockNumber = 1;

        // Index loop instead of pages.IndexOf(page), which rescanned the list for every page.
        for (var pageIndex = 0; pageIndex < pages.Count; pageIndex++)
        {
            var page = pages[pageIndex];
            var pageNumber = pageIndex + 1;
            var pageInfo = ParseTitle(page.Attribute("title")?.Value);

            // All geometry on this page is normalised against the page's own bbox.
            var frame = ResolvePageFrame(pageInfo);

            var pageBlock = new TextractBlock
            {
                BlockType = "PAGE",
                Id = $"page-{pageNumber}",
                Page = pageNumber,
                Geometry = CreateGeometry(pageInfo, frame),
                Relationships = NewChildRelationships()
            };
            blocks.Add(pageBlock);

            foreach (var line in page.Descendants().Where(e => HasClass(e, "ocr_line")))
            {
                var lineInfo = ParseTitle(line.Attribute("title")?.Value);
                var lineId = $"line-{nextBlockNumber++}";

                var lineBlock = new TextractBlock
                {
                    BlockType = "LINE",
                    Id = lineId,
                    Page = pageNumber,
                    Geometry = CreateGeometry(lineInfo, frame),
                    Relationships = NewChildRelationships()
                };
                pageBlock.Relationships[0].Ids.Add(lineId);

                var wordTexts = new List<string>();
                foreach (var word in line.Descendants().Where(e => HasClass(e, "ocrx_word")))
                {
                    var wordText = GetElementText(word);
                    if (wordText.Length == 0)
                    {
                        // Empty words produce no block and consume no id.
                        continue;
                    }

                    var wordInfo = ParseTitle(word.Attribute("title")?.Value);
                    var wordId = $"word-{nextBlockNumber++}";

                    // Confidence stays 0 when x_wconf is absent or not a number.
                    float confidence = 0;
                    string rawConfidence;
                    if (wordInfo.TryGetValue("x_wconf", out rawConfidence))
                    {
                        TryParseInvariant(rawConfidence, out confidence);
                    }

                    lineBlock.Relationships[0].Ids.Add(wordId);
                    wordTexts.Add(wordText);
                    blocks.Add(new TextractBlock
                    {
                        BlockType = "WORD",
                        Id = wordId,
                        Page = pageNumber,
                        Text = wordText,
                        Confidence = confidence,
                        Geometry = CreateGeometry(wordInfo, frame)
                    });
                }

                // Build the line text from its words so markup whitespace (newlines,
                // indentation between spans) never leaks into the output. Lines without
                // any word element fall back to their own text content.
                lineBlock.Text = wordTexts.Count > 0
                    ? string.Join(" ", wordTexts)
                    : GetElementText(line);

                // The line is appended after its words, so each LINE follows its WORD blocks.
                blocks.Add(lineBlock);
            }
        }

        var response = new TextractResponse
        {
            DocumentMetadata = new DocumentMetadata { Pages = pages.Count },
            Blocks = blocks
        };

        return JsonSerializer.Serialize(response, SerializerOptions);
    }

    /// <summary>
    /// Creates the single, initially empty CHILD relationship carried by PAGE and LINE blocks.
    /// </summary>
    private static List<Relationship> NewChildRelationships()
    {
        return new List<Relationship>
        {
            new Relationship { Type = "CHILD", Ids = new List<string>() }
        };
    }

    /// <summary>
    /// Tests whether an element's <c>class</c> attribute contains the given class name.
    /// The attribute is treated as a whitespace-separated token list.
    /// </summary>
    private static bool HasClass(XElement element, string className)
    {
        var classes = element.Attribute("class")?.Value;
        if (string.IsNullOrEmpty(classes))
        {
            return false;
        }

        if (classes == className)
        {
            return true;
        }

        var tokens = classes.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
        return Array.IndexOf(tokens, className) >= 0;
    }

    /// <summary>
    /// Parses an hOCR <c>title</c> attribute (<c>name value; name value; ...</c>) into a dictionary.
    /// </summary>
    /// <param name="title">Raw attribute value; null or blank yields an empty dictionary.</param>
    /// <returns>
    /// Property name mapped to its trimmed value. Entries without a value are skipped;
    /// when a name repeats, the last occurrence wins.
    /// </returns>
    private static Dictionary<string, string> ParseTitle(string title)
    {
        var properties = new Dictionary<string, string>();
        if (string.IsNullOrWhiteSpace(title))
        {
            return properties;
        }

        foreach (var part in title.Split(';'))
        {
            var entry = part.Trim();
            var separator = entry.IndexOfAny(TitleKeySeparators);
            if (separator <= 0)
            {
                // No separator: a bare name with no value, or an empty segment.
                continue;
            }

            properties[entry.Substring(0, separator)] = entry.Substring(separator + 1).Trim();
        }

        return properties;
    }

    /// <summary>
    /// Determines the rectangle that geometry on a page is normalised against.
    /// </summary>
    /// <param name="pageInfo">Parsed title properties of the <c>ocr_page</c> element.</param>
    /// <returns>
    /// The page's <c>bbox</c> when it is present and has positive width and height;
    /// otherwise a fallback square of <see cref="FallbackPageSize"/> anchored at the origin.
    /// </returns>
    private static PageFrame ResolvePageFrame(Dictionary<string, string> pageInfo)
    {
        float left, top, right, bottom;
        if (TryGetBBox(pageInfo, out left, out top, out right, out bottom)
            && right > left
            && bottom > top)
        {
            return new PageFrame(left, top, right - left, bottom - top);
        }

        return new PageFrame(0f, 0f, FallbackPageSize, FallbackPageSize);
    }

    /// <summary>
    /// Reads the four <c>bbox</c> numbers (left, top, right, bottom) from parsed title properties.
    /// </summary>
    /// <returns>
    /// False when there is no <c>bbox</c>, it does not have exactly four values,
    /// or any value is not a number.
    /// </returns>
    private static bool TryGetBBox(
        Dictionary<string, string> info,
        out float left,
        out float top,
        out float right,
        out float bottom)
    {
        left = top = right = bottom = 0f;

        string raw;
        if (!info.TryGetValue("bbox", out raw))
        {
            return false;
        }

        // Split on any whitespace so repeated spaces or tabs do not yield empty fields.
        var parts = raw.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
        if (parts.Length != 4)
        {
            return false;
        }

        return TryParseInvariant(parts[0], out left)
            && TryParseInvariant(parts[1], out top)
            && TryParseInvariant(parts[2], out right)
            && TryParseInvariant(parts[3], out bottom);
    }

    /// <summary>
    /// Parses a number using the invariant culture so the result does not depend on
    /// the machine's regional settings.
    /// </summary>
    private static bool TryParseInvariant(string text, out float value)
    {
        return float.TryParse(text, NumberStyles.Float, CultureInfo.InvariantCulture, out value);
    }

    /// <summary>
    /// Creates Textract-style geometry from an element's hOCR <c>bbox</c>.
    /// </summary>
    /// <param name="info">Parsed title properties of the element.</param>
    /// <param name="frame">Page rectangle the coordinates are normalised against.</param>
    /// <returns>
    /// Bounding box and four-point polygon (top-left, top-right, bottom-right, bottom-left)
    /// as fractions of the page frame, or null when the element has no usable <c>bbox</c>.
    /// </returns>
    private static Geometry CreateGeometry(Dictionary<string, string> info, PageFrame frame)
    {
        float left, top, right, bottom;
        if (!TryGetBBox(info, out left, out top, out right, out bottom))
        {
            return null;
        }

        var x0 = (left - frame.Left) / frame.Width;
        var y0 = (top - frame.Top) / frame.Height;
        var x1 = (right - frame.Left) / frame.Width;
        var y1 = (bottom - frame.Top) / frame.Height;

        return new Geometry
        {
            BoundingBox = new BoundingBox
            {
                Width = (right - left) / frame.Width,
                Height = (bottom - top) / frame.Height,
                Left = x0,
                Top = y0
            },
            Polygon = new List<Point>
            {
                new Point { X = x0, Y = y0 },
                new Point { X = x1, Y = y0 },
                new Point { X = x1, Y = y1 },
                new Point { X = x0, Y = y1 }
            }
        };
    }

    /// <summary>
    /// Gets the text content of an element, including text nested in child elements,
    /// with every run of whitespace collapsed to a single space and the ends trimmed.
    /// </summary>
    private static string GetElementText(XElement element)
    {
        var tokens = element.Value.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
        return string.Join(" ", tokens);
    }

    /// <summary>
    /// Rectangle, in hOCR pixel coordinates, that a page's geometry is normalised against.
    /// </summary>
    private sealed class PageFrame
    {
        public PageFrame(float left, float top, float width, float height)
        {
            Left = left;
            Top = top;
            Width = width;
            Height = height;
        }

        public float Left { get; }

        public float Top { get; }

        public float Width { get; }

        public float Height { get; }
    }
}
#region Textract Data Models
/// <summary>
/// Root object of the JSON document produced by the converter: document-level
/// metadata plus the flat list of blocks.
/// </summary>
public class TextractResponse
{
/// <summary>Document-level metadata (page count).</summary>
[JsonPropertyName("DocumentMetadata")]
public DocumentMetadata DocumentMetadata { get; set; }
/// <summary>All PAGE, LINE and WORD blocks in the order the converter emitted them.</summary>
[JsonPropertyName("Blocks")]
public List<TextractBlock> Blocks { get; set; }
}
/// <summary>
/// Document-level metadata serialized under <c>DocumentMetadata</c>.
/// </summary>
public class DocumentMetadata
{
/// <summary>Number of pages; the converter sets this to the count of <c>ocr_page</c> elements found.</summary>
[JsonPropertyName("Pages")]
public int Pages { get; set; }
}
/// <summary>
/// One entry of the <c>Blocks</c> array. The converter serializes with nulls omitted,
/// so properties left null do not appear in the JSON.
/// </summary>
public class TextractBlock
{
/// <summary>Kind of block; the converter writes "PAGE", "LINE" or "WORD".</summary>
[JsonPropertyName("BlockType")]
public string BlockType { get; set; }
/// <summary>Block identifier; the converter generates "page-N", "line-N" and "word-N".</summary>
[JsonPropertyName("Id")]
public string Id { get; set; }
/// <summary>Page number the block belongs to; the converter numbers pages from 1.</summary>
[JsonPropertyName("Page")]
public int Page { get; set; }
/// <summary>Recognised text; set by the converter on LINE and WORD blocks.</summary>
[JsonPropertyName("Text")]
public string Text { get; set; }
/// <summary>
/// Recognition confidence. The converter sets it on WORD blocks only, from the hOCR
/// <c>x_wconf</c> title property (0 when that is absent or unparsable); null otherwise.
/// </summary>
[JsonPropertyName("Confidence")]
public float? Confidence { get; set; }
/// <summary>Location of the block; null when the source element has no usable <c>bbox</c>.</summary>
[JsonPropertyName("Geometry")]
public Geometry Geometry { get; set; }
/// <summary>
/// Links to other blocks. The converter gives PAGE and LINE blocks a single CHILD
/// relationship and leaves this null on WORD blocks.
/// </summary>
[JsonPropertyName("Relationships")]
public List<Relationship> Relationships { get; set; }
}
/// <summary>
/// Typed link from one block to a list of other blocks, referenced by id.
/// </summary>
public class Relationship
{
/// <summary>Relationship kind; the converter only writes "CHILD".</summary>
[JsonPropertyName("Type")]
public string Type { get; set; }
/// <summary>Ids of the related blocks (line ids on a PAGE block, word ids on a LINE block).</summary>
[JsonPropertyName("Ids")]
public List<string> Ids { get; set; }
}
/// <summary>
/// Location of a block, given both as an axis-aligned box and as a polygon.
/// </summary>
public class Geometry
{
/// <summary>Axis-aligned rectangle derived from the element's hOCR <c>bbox</c>.</summary>
[JsonPropertyName("BoundingBox")]
public BoundingBox BoundingBox { get; set; }
/// <summary>
/// Corner points of the same rectangle; the converter writes four points in the order
/// top-left, top-right, bottom-right, bottom-left.
/// </summary>
[JsonPropertyName("Polygon")]
public List<Point> Polygon { get; set; }
}
/// <summary>
/// Axis-aligned rectangle described by its top-left corner and its extent. The converter
/// stores values scaled by a page dimension rather than raw hOCR pixel coordinates.
/// </summary>
public class BoundingBox
{
/// <summary>Horizontal extent (right minus left), scaled by the page width.</summary>
[JsonPropertyName("Width")]
public float Width { get; set; }
/// <summary>Vertical extent (bottom minus top), scaled by the page height.</summary>
[JsonPropertyName("Height")]
public float Height { get; set; }
/// <summary>X coordinate of the left edge, scaled by the page width.</summary>
[JsonPropertyName("Left")]
public float Left { get; set; }
/// <summary>Y coordinate of the top edge, scaled by the page height.</summary>
[JsonPropertyName("Top")]
public float Top { get; set; }
}
/// <summary>
/// One vertex of a <c>Polygon</c>, in the same scaled coordinates as the bounding box.
/// </summary>
public class Point
{
/// <summary>Horizontal coordinate, scaled by the page width.</summary>
[JsonPropertyName("X")]
public float X { get; set; }
/// <summary>Vertical coordinate, scaled by the page height.</summary>
[JsonPropertyName("Y")]
public float Y { get; set; }
}
#endregion
}