ocropus · JKamlah · Jul 26, 2019 · Jul 26, 2019 · Jul 26, 2019 · Jul 26, 2019
diff --git a/README.md b/README.md
@@ -25,6 +25,7 @@
   * [hocr-lines](#hocr-lines) -- extract the text within all the ocr_line elements
   * [hocr-merge-dc](#hocr-merge-dc) -- merge Dublin Core meta data into the hOCR HTML header
   * [hocr-pdf](#hocr-pdf) -- create a searchable PDF from a pile of hOCR and JPEG
+  * [hocr-simplify](#hocr-simplify) -- compute a simplified hOCR file
   * [hocr-split](#hocr-split) -- split an hOCR file into individual pages
   * [hocr-wordfreq](#hocr-wordfreq) -- calculate word frequency in an hOCR file
 * [Unit tests](#unit-tests)
@@ -207,6 +208,21 @@ hocr-pdf --savefile out.pdf <imgdir>
 
 Create a searchable PDF from a pile of hOCR and JPEG. It is important that the corresponding JPEG and hOCR files have the same name with their respective file ending. All of these files should lie in one directory, which one has to specify as an argument when calling the command, e.g. use `hocr-pdf . > out.pdf` to run the command in the current directory and save the output as `out.pdf` alternatively `hocr-pdf . --savefile out.pdf` which avoids routing the output through the terminal.
 
+### hocr-simplify
+
+```
+hocr-simplify input.html [output.html] [-t TYPESETTING] [-a ATTRIBUTE [ATTRIBUTE ...]] [-c] [-e] [-p PROPERTY [PROPERTY ...]] [-v]
+```
+
+Compute a simplified hOCR file. If called without any output path, the result is printed to the terminal.  
+Use:  
+`-t` to set a new typesetting level, lower ones will be removed, e.g. `-t page`  
+`-a` to remove attributes, it will be applied to all typesetting levels, e.g. `-a id`  
+`-c` to remove character choices  
+`-e` to remove any text content containing only whitespaces or nothing  
+`-p` to remove properties, e.g. `-p baseline`
+
+
 ### hocr-split
 
 ```

diff --git a/hocr-simplify b/hocr-simplify
@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+
+# Create a simplified hOCR version by:
+# - changing the level of typesetting
+# - removing properties
+# - removing attributes
+# - removing empty contents
+# - removing character alternatives (choices)
+#
+# Diagnostics requested with -v go to stderr so that they never end up
+# inside the hOCR document when the result is printed to stdout.
+
+
+from __future__ import print_function
+import argparse
+import re
+import os
+from io import open
+import sys
+
+from lxml import etree, html
+
+parser = argparse.ArgumentParser(
+    description=('change level of typesetting and/or '
+                 'remove properties to create '
+                 'a simplified hocr-version'))
+
+# Property names listed in the --remove-properties help text.
+properties = ['baseline', 'bbox', 'cflow', 'cuts', 'hardbreak', 'image',
+              'imagemd5', 'lpageno', 'ppageno', 'nlp', 'order', 'poly',
+              'scan_res', 'textangle', 'x_bboxes', 'x_font', 'x_fsize',
+              'x_confs', 'x_scanner', 'x_source', 'x_wconf']
+
+# Typesetting classes ordered from the finest to the coarsest level; the
+# flattening loop below relies on this order.
+typesettings = ['ocrx_word', 'ocr_line', 'ocr_par', 'ocr_carea', 'ocr_page']
+
+parser.add_argument('file', nargs='?', default=sys.stdin,
+                    help='Input hOCR file, default: read from stdin')
+parser.add_argument('-t', '--typesetting', type=str,
+                    choices=typesettings,
+                    help='Sets a new minimum typesetting level.\n'
+                         'List of typesetting: {}'.format(','.join(typesettings)))
+parser.add_argument('-a', '--remove-attributes', nargs='+',
+                    help='Removes attributes, e.g. id')
+parser.add_argument('-c', '--remove-choices', action='store_true',
+                    help='Removes character alternatives (tesseract outputs only)')
+parser.add_argument('-e', '--remove-empty-contents', action='store_true',
+                    help='Removes contents which are empty or contains whitespaces only')
+parser.add_argument('-p', '--remove-properties', nargs='+',
+                    help='List of properties: {}'.format(','.join(properties)))
+parser.add_argument('fileout', nargs='?',
+                    help="Output path, default: print to terminal")
+parser.add_argument('-v', '--verbose',
+                    action='store_true', help='Verbose, default: %(default)s')
+
+args = parser.parse_args()
+
+# Read the input as UTF-8. When no path is given, wrap the stdin file
+# descriptor instead of passing the sys.stdin object to open(), which would
+# raise a TypeError; closefd=False leaves the real stdin open afterwards.
+read_stdin = args.file is sys.stdin
+source = sys.stdin.fileno() if read_stdin else args.file
+with open(source, "r", encoding="utf-8", closefd=not read_stdin) as f:
+    doc = html.parse(f)
+
+# delete all nodes whose id attribute contains "lstm_choices"
+if args.remove_choices:
+    for node in doc.xpath('.//*[contains(@id,"lstm_choices")]'):
+        node.getparent().remove(node)
+
+# change level of typesetting
+if args.typesetting:
+    # update meta content: cut the capability list right after the new level
+    node = doc.find(".//*[@name='ocr-capabilities']")
+    if node is not None:
+        content = node.get("content")
+        if content is not None and args.typesetting in content:
+            node.set("content", content.split(args.typesetting)[0] + args.typesetting)
+            if args.verbose:
+                print(node.get("content"), file=sys.stderr)
+
+    # apply new level of typesetting: flatten every element from the finest
+    # level up to and including the requested one into plain text
+    for typesetting in typesettings:
+        # words are glued together, lines are joined by a space, coarser
+        # levels by a newline
+        if "word" in typesetting:
+            separator = ""
+        elif "line" in typesetting:
+            separator = " "
+        else:
+            separator = "\n"
+        for node in doc.xpath("//*[@class='{}']".format(typesetting)):
+            text_content = node.text_content()
+            if args.verbose and typesetting == args.typesetting:
+                print(re.sub(r'\s+', '\x20', text_content).strip(), file=sys.stderr)
+            # NOTE: blank fragments are dropped unconditionally (the former
+            # filter expression always reduced to this test), so
+            # --remove-empty-contents has no additional effect here.
+            node.text = separator.join([text.strip() for text in
+                                        text_content.splitlines() if
+                                        text.strip()])
+            for child in list(node):
+                node.remove(child)
+        if typesetting == args.typesetting:
+            break
+
+# remove properties
+if args.remove_properties:
+    for node in doc.xpath("//*[@title]"):
+        title = node.get("title")
+        kept = []
+        for prop in title.split(";"):
+            # the property name is the first word; blank segments (e.g.
+            # after a trailing ";") have no name and are kept unchanged
+            words = prop.split()
+            if words and words[0] in args.remove_properties:
+                continue
+            kept.append(prop.replace("\"", "'"))
+        node.set('title', ';'.join(kept))
+        if args.verbose:
+            print("Replaced :{}".format(title), file=sys.stderr)
+else:
+    # Replace double quotation marks with single
+    for node in doc.xpath("//*[@title]"):
+        node.set("title", node.get("title").replace("\"", "'"))
+
+# remove attributes
+if args.remove_attributes:
+    for attr in args.remove_attributes:
+        for node in doc.xpath("//*[@{}]".format(attr)):
+            node.attrib.pop(attr)
+
+# if no output path is given, print to terminal
+if args.fileout is None:
+    encoding = "utf-8"
+    if sys.version_info[0] > 2:
+        # on Python 3 ask lxml for text instead of bytes so that print()
+        # does not emit a b'...' literal
+        encoding = str
+    print(etree.tostring(doc, pretty_print=True, encoding=encoding))
+
+else:
+    # create output directory if needed; a bare file name has an empty
+    # dirname, which os.makedirs() would reject
+    outdir = os.path.dirname(args.fileout)
+    if outdir and not os.path.isdir(outdir):
+        os.makedirs(outdir)
+
+    # write new hocr file
+    with open(args.fileout, "wb") as f:
+        f.write(etree.tostring(doc, pretty_print=True, encoding="utf-8"))
diff --git a/test/hocr-simplify/hocr-simplify.tsht b/test/hocr-simplify/hocr-simplify.tsht
@@ -0,0 +1,20 @@
+#!/usr/bin/env tsht
+TESTDATA="../testdata"
+SIMPLEFILE="./tess.simple.hocr"
+
+plan 3
+
+after () {
+    rm -f "$SIMPLEFILE"
+}
+
+# Output sizes are read with `wc -c`; the column spacing of `ls -l` is not
+# the same on every platform, which makes `cut -f5` unreliable.
+hocr-simplify "$TESTDATA/tess.hocr" -t ocr_page > "$SIMPLEFILE" || fail 'hocr-simplify'
+equals 3268 $(wc -c < "$SIMPLEFILE") 'filesize == 3268'
+
+hocr-simplify "$TESTDATA/tess_choices.hocr" -c -t ocr_line > "$SIMPLEFILE" || fail 'hocr-simplify'
+equals 9691 $(wc -c < "$SIMPLEFILE") 'filesize == 9691'
+
+hocr-simplify "$TESTDATA/tess_choices_charboxes.hocr" -c -t ocrx_word > "$SIMPLEFILE" || fail 'hocr-simplify'
+equals 58622 $(wc -c < "$SIMPLEFILE") 'filesize == 58622'
diff --git a/test/smoke.tsht b/test/smoke.tsht
@@ -1,6 +1,6 @@
 #!/usr/bin/env tsht
 
-for f in check combine eval eval-geom eval-lines extract-g1000 extract-images lines merge-dc pdf split;do
+for f in check combine eval eval-geom eval-lines extract-g1000 extract-images lines merge-dc pdf split simplify;do
     exec_ok "hocr-$f" "--help"
     exec_ok "hocr-$f" "-h"
 done
diff --git a/test/testdata/kraken.hocr b/test/testdata/kraken.hocr
diff --git a/test/testdata/ocropus.hocr b/test/testdata/ocropus.hocr