Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,14 @@ mmif/ver
mmif/res
mmif/vocabulary
./VERSION*
VERSION
.hypothesis

# Documentation build artifacts
documentation/cli_help.rst
documentation/whatsnew.rst
docs-test

# environments
.venv*
venv*
6 changes: 3 additions & 3 deletions mmif/serialize/mmif.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import warnings
from collections import defaultdict
from datetime import datetime
from typing import List, Union, Optional, Dict, cast, Iterator
from typing import Any, List, Union, Optional, Dict, cast, Iterator

import jsonschema.validators

Expand Down Expand Up @@ -487,11 +487,11 @@ def get_documents_in_view(self, vid: Optional[str] = None) -> List[Document]:
else:
return []

def get_documents_by_type(self, doc_type: Union[str, DocumentTypes]) -> List[Document]:
def get_documents_by_type(self, doc_type: Any) -> List[Document]:
"""
Method to get all documents where the type matches a particular document type, which should be one of the CLAMS document types.

:param doc_type: the type of documents to search for, must be one of ``Document`` type defined in the CLAMS vocabulary.
:param doc_type: the type of documents to search for, must be one of ``Document`` types defined in the CLAMS vocabulary.
:return: a list of documents matching the requested type, or an empty list if none found.
"""
docs = []
Expand Down
71 changes: 71 additions & 0 deletions mmif/utils/cli/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# MMIF CLI Scripts

This directory contains CLI scripts like `source` and `rewind` that can be called from the command line. These scripts are called as subcommands of the `mmif` CLI script, for example `mmif source --help`.


## Adding another CLI script

To add a CLI script, all you need to do is add a Python module to `mmif/utils/cli` and make sure it has the following three functions:

1. `prep_argparser(**kwargs)` to define and return an instance of `argparse.ArgumentParser`.

2. `describe_argparser()` to return a pair of strings that describe the script. The first string is a one-line description of the argument parser and the second a more verbose description. These will be shown for `mmif --help` and `mmif subcommand --help` respectively.

3. `main(args)` to do the actual work of running the code.

See the current CLI scripts for examples.


## Some background

The mmif-python package has a particular way to deal with CLI utility scripts. All scripts live in the `mmif.utils.cli` package. The `mmif/__init__.py` module has the `cli()` function, which illustrates the requirements on utility scripts:

```python
def cli():
    # Build the top-level parser and the subparsers object used to register subcommands.
    parser, subparsers = prep_argparser_and_subcmds()
    cli_modules = {}
    # Every module found in mmif.utils.cli becomes a subcommand named after
    # the last component of its dotted module name.
    for cli_module in find_all_modules('mmif.utils.cli'):
        cli_module_name = cli_module.__name__.rsplit('.')[-1]
        cli_modules[cli_module_name] = cli_module
        # add_help=False because this parser is only used as a parent of the
        # subcommand parser created below.
        subcmd_parser = cli_module.prep_argparser(add_help=False)
        # describe_argparser() returns (one-liner, verbose description).
        subparsers.add_parser(cli_module_name, parents=[subcmd_parser],
                              help=cli_module.describe_argparser()[0],
                              description=cli_module.describe_argparser()[1],
                              formatter_class=argparse.RawDescriptionHelpFormatter)
    # Called without any arguments: print the help to stderr and exit with status 1.
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    # Unknown subcommand: print the help; otherwise hand over to the module's main().
    if args.subcmd not in cli_modules:
        parser.print_help(sys.stderr)
    else:
        cli_modules[args.subcmd].main(args)
```

<!--
[cli() function](https://github.com/clamsproject/mmif-python/blob/8e6426d8d4345485fff06a0a149657e3d4fc8399/mmif/__init__.py#L47-L66)
-->

You can see the invocations of the three functions mentioned above.

The `cli()` function uses `find_all_modules()`, which finds modules in the top level of the cli package. Such a module could have all the code needed for the CLI to work, but it could refer to other modules as well. For example, the `summarize.py` script is in `cli`, but it imports the summary utility from `mmif.utils.summarizer`.

In the setup.py script there is this passage towards the end of the file:

```python
entry_points={
'console_scripts': [
'mmif = mmif.__init__:cli',
],
},
```

This leaves it up to the `cli()` function to find the scripts, and this is why just adding a submodule as mentioned above works. Note that the initialization file of the cli package imports two of the command-line-related scripts:

```python
from mmif.utils.cli import rewind
from mmif.utils.cli import source
```

These may be used somewhere, but they are not necessary to run MMIF CLI scripts.

31 changes: 31 additions & 0 deletions mmif/utils/cli/summarize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import sys
import argparse

from mmif.utils.summarizer.summary import Summary



def describe_argparser() -> tuple:
    """
    Describe the summarize subcommand for the help messages.

    Returns a pair of strings: a one-line description, shown for
    `mmif --help`, and a more verbose description, shown for
    `mmif summarize --help`. Both are currently the same text, but the
    return value stays a tuple because mmif.cli() depends on it.
    """
    description = 'Create a JSON Summary for a MMIF file'
    return (description,) * 2


def prep_argparser(**kwargs):
    """
    Build the argument parser for the summarize subcommand.

    Any keyword arguments are passed through to argparse.ArgumentParser. The
    parser description is the second string returned by describe_argparser().
    Two required options are defined: -i for the input MMIF file and -o for
    the output JSON summary file.
    """
    _, verbose_description = describe_argparser()
    subcmd_parser = argparse.ArgumentParser(
        description=verbose_description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        **kwargs)
    required_options = (
        ("-i", 'MMIF_FILE', 'input MMIF file'),
        ("-o", 'OUTPUT_FILE', 'output JSON summary file'),
    )
    for flag, placeholder, help_text in required_options:
        subcmd_parser.add_argument(flag, metavar=placeholder, help=help_text, required=True)
    return subcmd_parser


def main(args):
    """
    Run the summarize subcommand.

    Creates a Summary from the value of the -i option and calls its report()
    method with the value of the -o option as the output file.
    """
    Summary(args.i).report(outfile=args.o)
40 changes: 40 additions & 0 deletions mmif/utils/summarizer/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@

import argparse

from mmif.utils.summarizer.summary import Summary


def argparser():
    """
    Build the argument parser for running the summarizer directly.

    Defines two required options: -i for the input MMIF file and -o for the
    output JSON summary file.
    """
    cli_parser = argparse.ArgumentParser(description='Create a JSON Summary for a MMIF file')
    required_options = (
        ('-i', 'MMIF_FILE', 'input MMIF file'),
        ('-o', 'JSON_FILE', 'output JSON summary file'),
    )
    for flag, placeholder, help_text in required_options:
        cli_parser.add_argument(flag, metavar=placeholder, help=help_text, required=True)
    return cli_parser


def pp_args(args):
    """
    Print the parsed arguments, one per line.

    Each line shows the attribute name left-aligned in a field of at least
    12 characters, followed by an arrow and the attribute value.
    """
    for name, value in vars(args).items():
        print(f'{name:12s} --> {value}')


def main():
    """
    Parse the command line and write a summary.

    Creates a Summary from the value of the -i option and calls its report()
    method with the value of the -o option as the output file. For debugging,
    pp_args() can be called on the parsed arguments to print them.
    """
    options = argparser().parse_args()
    Summary(options.i).report(outfile=options.o)


"""
There used to be an option to process a whole directory, but I never used it and decided
that, if needed, this would be better done by an extra script or a separate function.

The code for when there was a -d option is here just in case.

if args.d:
for mmif_file in pathlib.Path(args.d).iterdir():
if mmif_file.is_file() and mmif_file.name.endswith('.mmif'):
print(mmif_file)
json_file = str(mmif_file)[:-4] + 'json'
mmif_summary = Summary(mmif_file.read_text())
mmif_summary.report(outfile=json_file)
"""
69 changes: 69 additions & 0 deletions mmif/utils/summarizer/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@

from mmif.vocabulary import DocumentTypes
from mmif.vocabulary import AnnotationTypes


# The names of CLAMS applications, used to select views and to determine whether
# the summarizer is appropriate for the app version. Each name is an app
# identifier URL under http://apps.clams.ai/, in most cases ending in a version.
# TODO: this now requires an exhaustive listing of all allowed apps and their
# versions, we need a more maintainable system.

KALDI = [
    # The first two use MMIF 0.4 and should probably be retired
    'http://apps.clams.ai/aapb-pua-kaldi-wrapper/0.2.2',
    'http://apps.clams.ai/aapb-pua-kaldi-wrapper/0.2.3',
    'http://apps.clams.ai/aapb-pua-kaldi-wrapper/v3']

WHISPER = [
    'http://apps.clams.ai/whisper-wrapper/v7',
    'http://apps.clams.ai/whisper-wrapper/v8',
    'http://apps.clams.ai/whisper-wrapper/v8-3-g737e280']

# NOTE(review): the second entry has no version suffix, unlike the other
# list entries in this module -- confirm this is intended.
CAPTIONER = [
    'http://apps.clams.ai/llava-captioner/v1.2-6-gc824c97',
    'http://apps.clams.ai/smolvlm2-captioner']

NER = [
    'http://apps.clams.ai/spacy-wrapper/v1.1',
    'http://apps.clams.ai/spacy-wrapper/v2.1']

# NOTE(review): this is a single string without a version suffix, whereas the
# constants above are lists of versioned identifiers -- confirm that the code
# using these constants handles both shapes.
SEGMENTER = 'http://apps.clams.ai/audio-segmenter'


# When a named entity occurs 20 times we do not want to generate 20 instances of
# it. If the start of the next entity occurs within the below number of
# milliseconds after the end of the previous one, then it is just added to the
# previous one. This setting can be changed with the 'granularity' parameter.
# NOTE(review): this comment used to say that one minute is taken as the
# default, so that two mentions within a minute end up being the same instance,
# but 1000 milliseconds is one second (one minute would be 60000) -- confirm
# which of the two is intended.
# TODO: this seems broken

GRANULARITY = 1000


# Properties used for the summary for various tags. Each tuple lists property
# names. Judging by the constant names these are for documents (DOC), views
# (VIEW), time frames (TF) and entities (E) -- TODO confirm against the code
# that uses them.

DOC_PROPS = ('id', 'type', 'location')
VIEW_PROPS = ('id', 'timestamp', 'app')
TF_PROPS = ('id', 'start', 'end', 'frameType')
E_PROPS = ('id', 'group', 'cat', 'tag', 'video-start', 'video-end', 'coordinates')


# Names of types

# Short names taken from the DocumentTypes and AnnotationTypes vocabulary classes.
TEXT_DOCUMENT = DocumentTypes.TextDocument.shortname
VIDEO_DOCUMENT = DocumentTypes.VideoDocument.shortname
TIME_FRAME = AnnotationTypes.TimeFrame.shortname
BOUNDING_BOX = AnnotationTypes.BoundingBox.shortname
ALIGNMENT = AnnotationTypes.Alignment.shortname

# Type names spelled out as plain strings rather than read from the vocabulary
# classes imported in this module.
ANNOTATION = 'Annotation'
TOKEN = 'Token'
SENTENCE = 'Sentence'
PARAGRAPH = 'Paragraph'
NAMED_ENTITY = 'NamedEntity'
NOUN_CHUNK = 'NounChunk'
VERB_CHUNK = 'VerbChunk'

# Type names grouped by the kind of interval they describe: time-based
# (time frames) versus span-based (tokens, sentences, paragraphs, named
# entities and chunks).
TIME_BASED_INTERVALS = {TIME_FRAME}
SPAN_BASED_INTERVALS = {TOKEN, SENTENCE, PARAGRAPH, NAMED_ENTITY, NOUN_CHUNK, VERB_CHUNK}
Loading
Loading