clamsproject · marcverhagen · Jan 14, 2026 · Dec 23, 2025 · Jan 8, 2026 · Jan 8, 2026
diff --git a/.gitignore b/.gitignore
@@ -75,9 +75,14 @@ mmif/ver
 mmif/res
 mmif/vocabulary
 ./VERSION*
+VERSION
 .hypothesis
 
 # Documentation build artifacts
 documentation/cli_help.rst
 documentation/whatsnew.rst
 docs-test
+
+# environments
+.venv*
+venv*
diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py
@@ -15,7 +15,7 @@
 import warnings
 from collections import defaultdict
 from datetime import datetime
-from typing import List, Union, Optional, Dict, cast, Iterator
+from typing import Any, List, Union, Optional, Dict, cast, Iterator
 
 import jsonschema.validators
 
@@ -487,11 +487,11 @@ def get_documents_in_view(self, vid: Optional[str] = None) -> List[Document]:
         else:
             return []
 
-    def get_documents_by_type(self, doc_type: Union[str, DocumentTypes]) -> List[Document]:
+    def get_documents_by_type(self, doc_type: Any) -> List[Document]:
         """
         Method to get all documents where the type matches a particular document type, which should be one of the CLAMS document types.
 
-        :param doc_type: the type of documents to search for, must be one of ``Document`` type defined in the CLAMS vocabulary.
+        :param doc_type: the type of documents to search for, must be one of ``Document`` types defined in the CLAMS vocabulary.
         :return: a list of documents matching the requested type, or an empty list if none found.
         """
         docs = []

diff --git a/mmif/utils/cli/README.md b/mmif/utils/cli/README.md
@@ -0,0 +1,71 @@
+# MMIF CLI Scripts
+
+This directory contains CLI scripts like `source` and `rewind` that can be called from the command line. These scripts are called as subcommands of the `mmif` CLI script, for example `mmif source --help`.
+
+
+## Adding another CLI script
+
+To add a CLI script all you need to do is add a Python module to `mmif/utils/cli` and make sure it has the following three functions:
+
+1. `prep_argparser(**kwargs)` to define and return an instance of `argparse.ArgumentParser`.
+
+2. `describe_argparser()` to return a pair of strings that describe the script. The first string is a one-line description of the argument parser and the second a more verbose description. These will be shown for `mmif --help` and `mmif subcommand --help` respectively.
+
+3. `main(args)` to do the actual work of running the code.
+
+See the current CLI scripts for examples.
+
+
+## Some background
+
+The mmif-python package has a particular way to deal with CLI utility scripts. All scripts live in the `mmif.utils.cli` package. The `mmif/__init__.py` module has the `cli()` function, which illustrates the requirements on utility scripts:
+
+```python
+def cli():
+    parser, subparsers = prep_argparser_and_subcmds()
+    cli_modules = {}
+    for cli_module in find_all_modules('mmif.utils.cli'):
+        cli_module_name = cli_module.__name__.rsplit('.')[-1]
+        cli_modules[cli_module_name] = cli_module
+        subcmd_parser = cli_module.prep_argparser(add_help=False)
+        subparsers.add_parser(cli_module_name, parents=[subcmd_parser],
+                              help=cli_module.describe_argparser()[0],
+                              description=cli_module.describe_argparser()[1],
+                              formatter_class=argparse.RawDescriptionHelpFormatter)
+    if len(sys.argv) == 1:
+        parser.print_help(sys.stderr)
+        sys.exit(1)
+    args = parser.parse_args()
+    if args.subcmd not in cli_modules:
+        parser.print_help(sys.stderr)
+    else:
+        cli_modules[args.subcmd].main(args)
+```
+
+<!--
+[cli() function](https://github.com/clamsproject/mmif-python/blob/8e6426d8d4345485fff06a0a149657e3d4fc8399/mmif/__init__.py#L47-L66)
+-->
+
+You can see the invocations of the three functions mentioned above.
+
+The `cli()` function uses `find_all_modules()`, which finds modules in the top level of the cli package. Such a module could contain all the code needed for the CLI to work, but it could refer to other modules as well. For example, the `summarize.py` script is in `cli`, but it imports the summary utility from `mmif.utils.summarizer`.
+
+In the `setup.py` script there is this passage towards the end of the file:
+
+```python
+    entry_points={
+        'console_scripts': [
+            'mmif = mmif.__init__:cli',
+        ],
+    },
+```
+
+This leaves it up to the `cli()` function to find the scripts, and this is why just adding a submodule as mentioned above works. Note that the initialization file of the cli package imports two of the command-line-related scripts:
+
+```python
+from mmif.utils.cli import rewind
+from mmif.utils.cli import source
+```
+
+These may be used somewhere, but they are not necessary to run MMIF CLI scripts.
+
diff --git a/mmif/utils/cli/summarize.py b/mmif/utils/cli/summarize.py
@@ -0,0 +1,31 @@
+import sys
+import argparse
+
+from mmif.utils.summarizer.summary import Summary
+
+
+
+def describe_argparser() -> tuple:
+    """
+    Returns two strings: a one-line description of the argparser and additional
+    material, which will be shown for `mmif --help` and `mmif summarize --help`,
+    respectively. For now they return the same string. The return value should
+    still be a tuple because mmif.cli() depends on it.
+    """
+    oneliner = 'Create a JSON Summary for a MMIF file'
+    return oneliner, oneliner
+
+
+def prep_argparser(**kwargs):
+    parser = argparse.ArgumentParser(
+        description=describe_argparser()[1],
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        **kwargs)
+    parser.add_argument("-i", metavar='MMIF_FILE', help='input MMIF file', required=True)
+    parser.add_argument("-o", metavar='OUTPUT_FILE', help='output JSON summary file', required=True)
+    return parser
+
+
+def main(args):
+    mmif_summary = Summary(args.i)
+    mmif_summary.report(outfile=args.o)
diff --git a/mmif/utils/summarizer/__init__.py b/mmif/utils/summarizer/__init__.py
@@ -0,0 +1,40 @@
+
+import argparse
+
+from mmif.utils.summarizer.summary import Summary
+
+
+def argparser():
+    parser = argparse.ArgumentParser(description='Create a JSON Summary for a MMIF file')
+    parser.add_argument('-i', metavar='MMIF_FILE', help='input MMIF file', required=True)
+    parser.add_argument('-o', metavar='JSON_FILE', help='output JSON summary file', required=True)
+    return parser
+
+
+def pp_args(args):
+    for a, v in args.__dict__.items():
+        print(f'{a:12s}  -->  {v}')
+
+
+def main():
+    parser = argparser()
+    args = parser.parse_args()
+    #pp_args(args)
+    mmif_summary = Summary(args.i)
+    mmif_summary.report(outfile=args.o)
+
+
+"""
+There used to be an option to process a whole directory, but I never used it and decided
+that if needed it would better be done by an extra script or a separate function.
+
+The code for when there was a -d option is here just in case.
+
+if args.d:
+    for mmif_file in pathlib.Path(args.d).iterdir():
+        if mmif_file.is_file() and mmif_file.name.endswith('.mmif'):
+            print(mmif_file)
+            json_file = str(mmif_file)[:-4] + 'json'
+            mmif_summary = Summary(mmif_file.read_text())
+            mmif_summary.report(outfile=json_file)
+"""
diff --git a/mmif/utils/summarizer/config.py b/mmif/utils/summarizer/config.py
@@ -0,0 +1,69 @@
+
+from mmif.vocabulary import DocumentTypes
+from mmif.vocabulary import AnnotationTypes
+
+
+# The names of CLAMS applications, used to select views and to determine whether
+# the summarizer is appropriate for the app version.
+# TODO: this now requires an exhaustive listing of all allowed apps and their
+# versions, we need a more maintainable system.
+
+KALDI = [
+    # The first two use MMIF 0.4 and should probably be retired
+    'http://apps.clams.ai/aapb-pua-kaldi-wrapper/0.2.2',
+    'http://apps.clams.ai/aapb-pua-kaldi-wrapper/0.2.3',
+    'http://apps.clams.ai/aapb-pua-kaldi-wrapper/v3']
+
+WHISPER = [
+    'http://apps.clams.ai/whisper-wrapper/v7',
+    'http://apps.clams.ai/whisper-wrapper/v8',
+    'http://apps.clams.ai/whisper-wrapper/v8-3-g737e280']
+
+CAPTIONER = [
+    'http://apps.clams.ai/llava-captioner/v1.2-6-gc824c97',
+    'http://apps.clams.ai/smolvlm2-captioner']
+
+NER = [
+    'http://apps.clams.ai/spacy-wrapper/v1.1',
+    'http://apps.clams.ai/spacy-wrapper/v2.1']
+
+SEGMENTER = 'http://apps.clams.ai/audio-segmenter'
+
+
+# When a named entity occurs 20 times we do not want to generate 20 instances of
+# it. If the start of the next entity occurs within the below number of
+# milliseconds after the end of the previous, then it is just added to the
+# previous one. The default was described as one minute, but 1000 ms is one
+# second (one minute would be 60000) -- confirm which is intended. This setting
+# can be changed with the 'granularity' parameter.
+# TODO: this seems broken
+
+GRANULARITY = 1000
+
+
+# Properties used for the summary for various tags
+
+DOC_PROPS = ('id', 'type', 'location')
+VIEW_PROPS = ('id', 'timestamp', 'app')
+TF_PROPS = ('id', 'start', 'end', 'frameType')
+E_PROPS = ('id', 'group', 'cat', 'tag', 'video-start', 'video-end', 'coordinates')
+
+
+# Names of types
+
+TEXT_DOCUMENT = DocumentTypes.TextDocument.shortname
+VIDEO_DOCUMENT = DocumentTypes.VideoDocument.shortname
+TIME_FRAME = AnnotationTypes.TimeFrame.shortname
+BOUNDING_BOX = AnnotationTypes.BoundingBox.shortname
+ALIGNMENT = AnnotationTypes.Alignment.shortname
+
+ANNOTATION = 'Annotation'
+TOKEN = 'Token'
+SENTENCE = 'Sentence'
+PARAGRAPH = 'Paragraph'
+NAMED_ENTITY = 'NamedEntity'
+NOUN_CHUNK = 'NounChunk'
+VERB_CHUNK = 'VerbChunk'
+
+TIME_BASED_INTERVALS = {TIME_FRAME}
+SPAN_BASED_INTERVALS = {TOKEN, SENTENCE, PARAGRAPH, NAMED_ENTITY, NOUN_CHUNK, VERB_CHUNK}