diff --git a/.gitattributes b/.gitattributes index d4cb9c80f..b74930123 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,4 @@ databroker/_version.py export-subst .git_archival.txt export-subst +# SCM syntax highlighting & preventing 3-way merges +pixi.lock merge=binary linguist-language=YAML linguist-generated=true diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 000000000..02522b010 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,81 @@ +name: Documentation + +on: + workflow_dispatch: + pull_request: + push: + branches: + - main + release: + types: + - published + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + # Many color libraries just need this to be set to any value, but at least + # one distinguishes color depth, where "3" -> "256-bit color". + FORCE_COLOR: 3 + +jobs: + docs-build: + name: Documentation build + runs-on: ubuntu-latest + permissions: + contents: read + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - uses: astral-sh/setup-uv@v7 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: "3.12" # pinned and quoted: this job defines no strategy.matrix + + - name: Install + shell: bash -l {0} + run: source continuous_integration/scripts/install.sh + + - name: Install docs requirements + shell: bash -l {0} + run: | + set -vxeuo pipefail + python -m pip install -r requirements-docs.txt + python -m pip list + + - name: Build HTML + run: sphinx-build docs/source docs/_build/html + + - name: Upload HTML as GitHub artifact + uses: actions/upload-pages-artifact@v3 + with: + path: docs/_build/html + + docs-publish: + name: Deploy documentation to GitHub Pages + runs-on: ubuntu-latest + needs: docs-build + if: ${{ github.ref_name == 'main' }} + + # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages + permissions: + contents: read + pages: write + id-token: write + + environment: + 
name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + + steps: + - name: Setup GitHub Pages + uses: actions/configure-pages@v5 + + - name: Deploy HTML to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml index c6e1b1c3c..1b44c1513 100644 --- a/.github/workflows/publish-pypi.yml +++ b/.github/workflows/publish-pypi.yml @@ -1,4 +1,4 @@ -name: CD +name: Publish to PyPI on: workflow_dispatch: @@ -41,32 +41,3 @@ jobs: - name: Cleanup dist folder run: rm -rf dist - - dist-plugins: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - uses: hynek/build-and-inspect-python-package@v2 - with: - path: bluesky-tiled-plugins/ - upload-name-suffix: "-bluesky-tiled-plugins" - - publish-plugins: - needs: [dist-plugins] - environment: pypi - permissions: - id-token: write - runs-on: ubuntu-latest - if: github.event_name == 'release' && github.event.action == 'published' - - steps: - - - uses: actions/download-artifact@v4 - with: - name: Packages-bluesky-tiled-plugins - path: dist - - - uses: pypa/gh-action-pypi-publish@release/v1 \ No newline at end of file diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 4e3c31184..dacf3ac61 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -16,7 +16,7 @@ jobs: - 27017:27017 strategy: matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12", "3.13"] fail-fast: false steps: diff --git a/.gitignore b/.gitignore index d709d569c..f80a5413f 100644 --- a/.gitignore +++ b/.gitignore @@ -141,3 +141,6 @@ data/* # version file generated by setuptools *_version.py +# pixi environments +.pixi/* +!.pixi/config.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index 47dc1c286..000000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,24 +0,0 @@ -files: 
^bluesky-tiled-plugins/ -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 - hooks: - - id: check-added-large-files - - id: check-yaml - - id: check-merge-conflict - - - repo: local - hooks: - - id: ruff - name: lint with ruff - language: system - entry: ruff check --force-exclude --fix - types: [python] - require_serial: true - - - id: ruff-format - name: format with ruff - language: system - entry: ruff format --force-exclude - types: [python] - require_serial: true diff --git a/README.rst b/README.rst index c200da86c..1ef4253a7 100644 --- a/README.rst +++ b/README.rst @@ -4,9 +4,24 @@ Databroker |build_status| |coverage| |pypi_version| |license| -**The Databroker project is now in maintenance mode, and it is not recommended -for new users. It will be maintained until for years to come to support -existing user code. New users should use Bluesky Tiled Plugins.** +Deprecation Notice +================== + +Databroker is no longer recommended for new users or facilities adopting +Bluesky. Instead, `Tiled`_ with `Bluesky Tiled Plugins`_ is recommended as the +canonical way to persist and access data and metadata from Bluesky. + +Databroker now serves two purposes that remain relevant for some users and some +facilities. First, it contains code adapting the legacy MongoDB-based Bluesky +document storage to Tiled---effectively a server-side plugin for Tiled. Second, +it wraps the Tiled Python client to provide an API backward-compatible with +legacy Databroker user code. *If you do not have MongoDB-based Bluesky storage +and you do not have legacy Databroker user code, you do not need Databroker.* + +Databroker will be maintained by NSLS-II through **April 2027** at minimum to +support the transition from MongoDB-based document storage to PostgreSQL-based +storage. The Python user interface may be maintained longer still, depending on +the need. 
============== ============================================================== PyPI ``pip install databroker`` @@ -93,3 +108,7 @@ See the tutorials for more. .. _Suitcase: https://blueskyproject.io/suitcase/ .. _Intake: https://intake.readthedocs.io/en/latest/ + +.. _Bluesky Tiled Plugins: https://blueskyproject.io/bluesky-tiled-plugins/ + +.. _Tiled: https://blueskyproject.io/tiled/ diff --git a/bluesky-tiled-plugins/README.md b/bluesky-tiled-plugins/README.md deleted file mode 100644 index 416ee6893..000000000 --- a/bluesky-tiled-plugins/README.md +++ /dev/null @@ -1,10 +0,0 @@ -# bluesky-tiled-plugins - -This is a separate Python package, `bluesky-tiled-plugins`, that is -developed in the databroker repository. - -For a user wishing to connect to a running Tiled server and access Bluesky data, -this package, along with its dependency `tiled[client]`, is all they need. - -The databroker package is only required if the user wants to use the legacy -`databroker.Broker` API. diff --git a/bluesky-tiled-plugins/bluesky_tiled_plugins/__init__.py b/bluesky-tiled-plugins/bluesky_tiled_plugins/__init__.py deleted file mode 100644 index b1a3caf60..000000000 --- a/bluesky-tiled-plugins/bluesky_tiled_plugins/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from .clients.bluesky_event_stream import BlueskyEventStream # noqa: F401 -from .clients.bluesky_run import BlueskyRun # noqa: F401 -from .clients.catalog_of_bluesky_runs import CatalogOfBlueskyRuns # noqa: F401 -from .writing.tiled_writer import TiledWriter # noqa: F401 - -__all__ = [ - "BlueskyEventStream", - "BlueskyRun", - "CatalogOfBlueskyRuns", - "TiledWriter", -] diff --git a/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/__init__.py b/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/_common.py b/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/_common.py deleted file mode 100644 
index 03c0f8091..000000000 --- a/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/_common.py +++ /dev/null @@ -1,6 +0,0 @@ -# There are methods that IPython will try to call. -# We special-case them because we want to avoid the getattr -# resulting in an unnecessary network hit just to raise -# AttributeError. - -IPYTHON_METHODS = {"_ipython_canary_method_should_not_exist_", "_repr_mimebundle__ipython_display_"} diff --git a/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/bluesky_event_stream.py b/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/bluesky_event_stream.py deleted file mode 100644 index 0f941268c..000000000 --- a/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/bluesky_event_stream.py +++ /dev/null @@ -1,357 +0,0 @@ -import functools -import keyword -import warnings -from collections import defaultdict -from typing import Optional - -import numpy -import xarray -from tiled.client.composite import CompositeClient -from tiled.client.container import DEFAULT_STRUCTURE_CLIENT_DISPATCH, Container -from tiled.utils import DictView, OneShotCachedMap, Sentinel, node_repr - -from ._common import IPYTHON_METHODS - -DATAVALUES = Sentinel("DATAVALUES") -TIMESTAMPS = Sentinel("TIMESTAMPS") - - -class BlueskyEventStream(Container): - _ipython_display_ = None - _repr_mimebundle__ = None - - def __new__(cls, context, *, item, structure_clients, **kwargs): - # When inheriting from BlueskyEventStream, return the class itself - if cls is not BlueskyEventStream: - return super().__new__(cls) - - # Set the version based on the specs - _cls = BlueskyEventStreamV3 if cls._is_sql(item) else BlueskyEventStreamV2Mongo - return _cls(context, item=item, structure_clients=structure_clients, **kwargs) - - @staticmethod - def _is_sql(item): - for spec in item["attributes"]["specs"]: - if spec["name"] == "BlueskyEventStream": - if spec["version"].startswith("3."): - return True - return False - - -class BlueskyEventStreamV2Mongo(BlueskyEventStream): - """ - This 
encapsulates the data and metadata for one 'stream' in a Bluesky 'run'. - - This adds for bluesky-specific conveniences to the standard client Container. - """ - - def __repr__(self): - stream_name = self.metadata.get("stream_name") or self.item["id"] - return f"" - - @property - def descriptors(self): - return self.metadata["descriptors"] - - @property - def _descriptors(self): - # For backward-compatibility. - # We do not normally worry about backward-compatibility of _ methods, but - # for a time databroker.v2 *only* have _descriptors and not descriptors, - # and I know there is useer code that relies on that. - warnings.warn("Use `.descriptors` instead of `._descriptors`.", stacklevel=2) - return self.descriptors - - def __getattr__(self, key): - """ - Let run.X be a synonym for run['X'] unless run.X already exists. - - This behavior is the same as with pandas.DataFrame. - """ - # The wisdom of this kind of "magic" is arguable, but we - # need to support it for backward-compatibility reasons. - if key in IPYTHON_METHODS: - raise AttributeError(key) - if key in self: - return self[key] - raise AttributeError(key) - - def __dir__(self): - # Build a list of entries that are valid attribute names - # and add them to __dir__ so that they tab-complete. - tab_completable_entries = [ - entry for entry in self if (entry.isidentifier() and (not keyword.iskeyword(entry))) - ] - return super().__dir__() + tab_completable_entries - - def read(self, *args, **kwargs): - """ - Shortcut for reading the 'data' (as opposed to timestamps or config). - - That is: - - >>> stream.read(...) - - is equivalent to - - >>> stream["data"].read(...) - """ - return self["data"].read(*args, **kwargs) - - def to_dask(self): - warnings.warn( - """Do not use this method. 
-Instead, set dask or when first creating the client, as in - - >>> catalog = from_uri("...", "dask") - -and then read() will return dask objects.""", - DeprecationWarning, - stacklevel=2, - ) - return self.new_variation(structure_clients=DEFAULT_STRUCTURE_CLIENT_DISPATCH["dask"]).read() - - -class BlueskyEventStreamV2SQL(OneShotCachedMap): - def __init__(self, internal_dict, metadata=None): - super().__init__(internal_dict) - self.metadata = metadata or {} - - def __repr__(self): - stream_name = self.metadata.get("stream_name") - return f"" - - def __getitem__(self, key): - if "/" in key: - key, rest = key.split("/", 1) - return self[key][rest] - - return super().__getitem__(key) - - @classmethod - def from_stream_client(cls, stream_client, metadata=None): - stream_parts = set(stream_client.base.keys()) - data_keys = [k for k in stream_parts if k != "internal"] - ts_keys = ["time"] - if "internal" in stream_parts: - internal_cols = stream_client.base["internal"].columns - data_keys += [col for col in internal_cols if col != "seq_num" and not col.startswith("ts_")] - ts_keys += [col for col in internal_cols if col.startswith("ts_")] - - # Construct clients for the configuration data - cf_vals, cf_time = defaultdict(dict), defaultdict(dict) - if config := stream_client.metadata.get("configuration", {}): - updates = stream_client.metadata.get("_config_updates", []) - for obj_name, obj in config.items(): - for key in obj["data"].keys(): - _vs, _ts = [obj["data"][key]], [obj["timestamps"][key]] - - # Add values and timestamps from config_updates - for upd in updates: - if upd_config := upd.get("configuration", {}): - _vs.append(upd_config.get("data", {}).get(key)) - _ts.append(upd_config.get("timestamps", {}).get(key)) - - cf_vals[obj_name][key] = VirtualArrayClient(_vs) - cf_time[obj_name][key] = VirtualArrayClient(_ts) - - internal_dict = { - "data": lambda: CompositeSubsetClient(stream_client, data_keys), - "timestamps": lambda: CompositeSubsetClient(stream_client, 
ts_keys), - "config": lambda: VirtualContainer({k: ConfigDatasetClient(v) for k, v in cf_vals.items()}), - "config_timestamps": lambda: VirtualContainer({k: ConfigDatasetClient(v) for k, v in cf_time.items()}), - } - - # Construct the metadata - metadata = { - "descriptors": [], - "stream_name": stream_client.item["id"], - **stream_client.metadata, - **(metadata or {}), - } - - return cls(internal_dict, metadata=metadata) - - @functools.cached_property - def descriptors(self): - # Go back to the BlueskyRun node and request the documents - # the path is: bs_run_node/streams/current_stream (old) or bs_run_node/current_stream (new) - bs_run_node = self["data"].parent - if bs_run_node.item["id"] == "streams" and ("BlueskyRun" not in {s.name for s in bs_run_node.specs}): - # The parent is the old "streams" node, go up one more level - bs_run_node = bs_run_node.parent - stream_name = self.metadata.get("stream_name") or self["data"].item["id"] - return [ - doc for name, doc in bs_run_node.documents() if name == "descriptor" and doc["name"] == stream_name - ] - - @property - def _descriptors(self): - # For backward-compatibility. - # We do not normally worry about backward-compatibility of _ methods, but - # for a time databroker.v2 *only* have _descriptors and not descriptors, - # and I know there is useer code that relies on that. - warnings.warn("Use `.descriptors` instead of `._descriptors`.", stacklevel=2) - return self.descriptors - - def __getattr__(self, key): - """ - Let run.X be a synonym for run['X'] unless run.X already exists. - - This behavior is the same as with pandas.DataFrame. - """ - # The wisdom of this kind of "magic" is arguable, but we - # need to support it for backward-compatibility reasons. - if key in IPYTHON_METHODS: - raise AttributeError(key) - if key in self: - return self[key] - raise AttributeError(key) - - def read(self, *args, **kwargs): - """Read the data from the stream. 
- - This is a shortcut for reading the 'data' (as opposed to timestamps or config). - """ - return self["data"].read(*args, **kwargs) - - -class ConfigDatasetClient(DictView): - def __repr__(self): - tiled_repr = node_repr(self, self._internal_dict.keys()) - return tiled_repr.replace(type(self).__name__, "DatasetClient") - - def read(self): - # Delay this import for fast startup. In some cases only metadata - # is handled, and we can avoid the xarray import altogether. - - d = {k: {"dims": "time", "data": v.read()} for k, v in self._internal_dict.items()} - return xarray.Dataset.from_dict(d) - - -class CompositeSubsetClient(CompositeClient): - """A composite client with only a subset of its keys exposed.""" - - def __init__(self, client, keys=None): - super().__init__(context=client.context, item=client.item, structure_clients=client.structure_clients) - self._keys = keys or list(client.keys()) - - def __repr__(self): - return node_repr(self, self._keys).replace(type(self).__name__, "DatasetClient") - - def _keys_slice(self, start, stop, direction, page_size: Optional[int] = None, **kwargs): - yield from self._keys[start : stop : -1 if direction < 0 else 1] # noqa: 203 - - def _items_slice(self, start, stop, direction, page_size: Optional[int] = None, **kwargs): - for key in self._keys[start : stop : -1 if direction < 0 else 1]: # noqa: 203 - yield key, self[key] - - def __iter__(self): - yield from self._keys - - def __getitem__(self, key): - if key in self._keys: - return super().__getitem__(key) - raise KeyError(key) - - def __len__(self): - return len(self._keys) - - def __contains__(self, key): - return key in self._keys - - def read(self, variables=None, dim0=None): - variables = set(self._keys).intersection(variables or self._keys) - - return super().read(variables, dim0=dim0) - - -class VirtualContainer(DictView): - def __repr__(self): - tiled_repr = node_repr(self, self._internal_dict.keys()) - return tiled_repr.replace(type(self).__name__, 
"ContainerClient") - - def __getitem__(self, key): - if "/" in key: - key, rest = key.split("/", 1) - return self[key][rest] - - return super().__getitem__(key) - - -class VirtualArrayClient: - def __init__(self, data, dims=None): - # Delay this import for fast startup. In some cases only metadata - # is handled, and we can avoid the numpy import altogether. - - # Ensure data is an array-like object - if not hasattr(data, "__iter__") or isinstance(data, str): - data = [data] - if not hasattr(data, "__array__"): - data = numpy.asanyarray(data) - - self._data = data - self._dims = dims - - def __getitem__(self, slice): - return self.read(slice) - - def __repr__(self): - attrs = {"shape": self.shape, "dtype": self.dtype} - if dims := self.dims: - attrs["dims"] = dims - return "" - - def read(self, slice=None): - return self._data if slice is None else self._data[slice] - - @property - def size(self): - return self._data.size - - @property - def shape(self): - return self._data.shape - - @property - def dtype(self): - return self._data.dtype - - @property - def dims(self): - return self._dims - - -class BlueskyEventStreamV3(BlueskyEventStream, CompositeClient): - def __repr__(self): - stream_name = self.metadata.get("stream_name") or self.item["id"] - return f"" - - @property - def _var_keys(self): - return {k for k in self if not k.startswith("ts_") and k != "seq_num"} - - @property - def _ts_keys(self): - return {k for k in self if k.startswith("ts_")} - - def read(self, variables=(DATAVALUES,), dim0=None): - if DATAVALUES in variables: - variables = self._var_keys.union(variables) - {DATAVALUES} - if TIMESTAMPS in variables: - variables = self._ts_keys.union(variables) - {TIMESTAMPS} - - return super().read(variables=variables, dim0=dim0) - - @functools.cached_property - def descriptors(self): - # Go back to the BlueskyRun node and requests the documents - stream_name = self.metadata.get("stream_name") or self.item["id"] - # the path is: 
bs_run_node/streams/current_stream (old) or bs_run_node/current_stream (new) - bs_run_node = self.parent - if bs_run_node.item["id"] == "streams" and ("BlueskyRun" not in {s.name for s in bs_run_node.specs}): - # The parent is the old "streams" node, go up one more level - bs_run_node = bs_run_node.parent - return [ - doc for name, doc in bs_run_node.documents() if name == "descriptor" and doc["name"] == stream_name - ] diff --git a/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/bluesky_run.py b/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/bluesky_run.py deleted file mode 100644 index 443df3476..000000000 --- a/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/bluesky_run.py +++ /dev/null @@ -1,403 +0,0 @@ -import copy -import functools -import io -import json -import keyword -import warnings -from datetime import datetime -from typing import Optional - -from tiled.client.container import Container -from tiled.client.utils import handle_error - -from ._common import IPYTHON_METHODS -from .bluesky_event_stream import BlueskyEventStreamV2SQL -from .document import DatumPage, Descriptor, Event, EventPage, Resource, Start, Stop, StreamDatum, StreamResource - -_document_types = { - "start": Start, - "stop": Stop, - "event": Event, - "descriptor": Descriptor, - "event_page": EventPage, - "datum_page": DatumPage, - "resource": Resource, - "stream_resource": StreamDatum, - "stream_datum": StreamResource, -} - - -class BlueskyRun(Container): - _ipython_display_ = None - _repr_mimebundle_ = None - - def __new__(cls, context, *, item, structure_clients, **kwargs): - # When inheriting from BlueskyRun, return the class itself - if cls is not BlueskyRun: - return super().__new__(cls) - - # Set the version based on the specs - _cls = BlueskyRunV3 if cls._is_sql(item) else BlueskyRunV2Mongo - return _cls(context, item=item, structure_clients=structure_clients, **kwargs) - - @staticmethod - def _is_sql(item): - for spec in item["attributes"]["specs"]: - if 
spec["name"] == "BlueskyRun": - if spec["version"].startswith("3."): - return True - return False - - def __repr__(self): - metadata = self.metadata - datetime_ = datetime.fromtimestamp(metadata["start"]["time"]) - return ( - f"" - ) - - @property - def start(self): - """ - The Run Start document. A convenience alias: - - >>> run.start is run.metadata["start"] - True - """ - return self.metadata["start"] - - @property - def stop(self): - """ - The Run Stop document. A convenience alias: - - >>> run.stop is run.metadata["stop"] - True - """ - return self.metadata["stop"] - - @functools.cached_property - def descriptors(self): - return [doc for name, doc in self.documents() if name == "descriptor"] - - def __getattr__(self, key): - """ - Let run.X be a synonym for run['X'] unless run.X already exists. - - This behavior is the same as with pandas.DataFrame. - """ - # The wisdom of this kind of "magic" is arguable, but we - # need to support it for backward-compatibility reasons. - if key in IPYTHON_METHODS: - raise AttributeError(key) - if key in self: - return self[key] - raise AttributeError(key) - - def __dir__(self): - # Build a list of entries that are valid attribute names - # and add them to __dir__ so that they tab-complete. - tab_completable_entries = [ - entry for entry in self if (entry.isidentifier() and (not keyword.iskeyword(entry))) - ] - return super().__dir__() + tab_completable_entries - - def describe(self): - "For back-compat with intake-based BlueskyRun" - warnings.warn( - "This will be removed. Use .metadata directly instead of describe()['metadata'].", - DeprecationWarning, - stacklevel=2, - ) - return {"metadata": self.metadata} - - def __call__(self): - warnings.warn( - "Do not call a BlueskyRun. For now this returns self, for " - "backward-compatibility. 
but it will be removed in a future " - "release.", - DeprecationWarning, - stacklevel=2, - ) - return self - - def read(self): - raise NotImplementedError( - "Reading any entire run is not supported. Access a stream in this run and read that." - ) - - @property - def base(self): - "Return the base Container client instead of a BlueskyRun client" - return Container( - self.context, - item=self.item, - structure_clients=self.structure_clients, - queries=self._queries, - sorting=self._sorting, - include_data_sources=self._include_data_sources, - ) - - to_dask = read - - -class BlueskyRunV2(BlueskyRun): - """A MongoDB-native layout of BlueskyRuns - - This layout has been in use prior to the introduction of SQL backend in May 2025. - """ - - _version = "2.0" - - def __new__(cls, context, *, item, structure_clients, **kwargs): - # When inheriting, return the class itself - if cls is not BlueskyRunV2: - return super().__new__(cls, context, item=item, structure_clients=structure_clients, **kwargs) - - _cls = BlueskyRunV2SQL if cls._is_sql(item) else BlueskyRunV2Mongo - return _cls(context, item=item, structure_clients=structure_clients, **kwargs) - - @property - def v1(self): - "Accessor to legacy interface." 
- from databroker.v1 import Broker, Header - - db = Broker(self) - header = Header(self, db) - return header - - @property - def v2(self): - return self - - @property - def v3(self): - if not self._is_sql(self.item): - raise NotImplementedError("v3 is not available for MongoDB-based BlueskyRun") - - structure_clients = copy.copy(self.structure_clients) - structure_clients.set("BlueskyRun", lambda: BlueskyRunV3) - return BlueskyRunV3(self.context, item=self.item, structure_clients=structure_clients) - - -class BlueskyRunV2Mongo(BlueskyRunV2): - def documents(self, fill=False): - if fill == "yes": - fill = True - elif fill == "no": - fill = False - elif fill == "delayed": - raise NotImplementedError("fill='delayed' is not supported") - else: - fill = bool(fill) - link = self.item["links"]["self"].replace("/metadata", "/documents", 1) - with self.context.http_client.stream( - "GET", - link, - params={"fill": fill}, - headers={"Accept": "application/json-seq"}, - ) as response: - if response.is_error: - response.read() - handle_error(response) - tail = "" - for chunk in response.iter_bytes(): - for line in chunk.decode().splitlines(keepends=True): - if line[-1] == "\n": - item = json.loads(tail + line) - yield (item["name"], _document_types[item["name"]](item["doc"])) - tail = "" - else: - tail += line - if tail: - item = json.loads(tail) - yield (item["name"], _document_types[item["name"]](item["doc"])) - - -class _BlueskyRunSQL(BlueskyRun): - """A base class for a BlueskyRun that is backed by a SQL database. - - This class implements the SQL-specific method for accessing the stream of - Bluesky documents. It is not intended to be used directly, but rather as a - base class for other classes (v2 and v3) that implement additional methods. - """ - - @functools.cached_property - def _has_streams_namespace(self) -> bool: - """Determine whether the BlueskyRun has an intermediate "streams" namespace. - - Maintained for backward compatibility. 
Returns True if the following conditions are met: - 1. There is a "streams" key in the base container. - 2. The specs of the "streams" container do not include "BlueskyEventStream", - indicating that "streams" is not itself a BlueskyEventStream. - """ - return ("streams" in self.base) and ( - "BlueskyEventStream" not in {s.name for s in self.base["streams"].specs} - ) - - @functools.cached_property - def _stream_names(self) -> list[str]: - """Get the sorted list of stream names in the BlueskyRun. - - This property accounts for both the new layout (without "streams" namespace) - and the old layout (with "streams" namespace), in which case the stream names - are derived from the keys under the "streams" namespace. - """ - - return sorted(k for k in (self.base["streams"] if self._has_streams_namespace else self.base)) - - def __getitem__(self, key): - if isinstance(key, tuple): - key = "/".join(key) - - base_class = super() # The base Container class - - def _base_getitem(key): - # Try to get the item directly from the new container layout. Consider nested keys. - try: - return base_class.__getitem__(key) - except KeyError as e: - try: - # The requested key might be a column in the "internal" table - key = key.split("/") - key.insert(-1, "internal") - return base_class.__getitem__("/".join(key)) - except KeyError: - raise KeyError(f"Key '{key[-1]}' not found in the BlueskyRun container") from e - - # Back-compatibility for old versions of BlueskyRun layout that included 'streams' namespace. - # This takes into account the possibility of an actual BlueskyEventStream to be named 'streams'. - try: - return _base_getitem(key) - except KeyError as e: - if key == "streams": - warnings.warn( - "Looks like you are trying to access the 'streams' namespace, " - "but there is no 'streams' namespace in this BlueskyRun, which follows the new layout. " - "Please use the stream names directly, e.g. 
run['primary'] instead of run['streams/primary'].", - DeprecationWarning, - stacklevel=2, - ) - return self - elif key.split("/")[0] != "streams": - try: - result = _base_getitem("streams/" + key) - warnings.warn( - f"Key '{key}' not found directly in the BlueskyRun container. " - "Trying to access it via the 'streams' namespace for backward-compatibility. " - "This behavior is deprecated and will be removed in a future release. " - "Please consider migrating the catalog structure to the new layout.", - DeprecationWarning, - stacklevel=2, - ) - return result - except KeyError: - raise KeyError from e - elif key.split("/")[0] == "streams": - try: - result = _base_getitem(key[len("streams/") :]) # noqa - warnings.warn( - f"Looks like you are trying to access '{key}' via a 'streams' namespace, " - "but there is no 'streams' namespace in this BlueskyRun, which follows the new layout. " - f"Please access the stream directly, e.g. run['{key}'] instead of run['streams/{key}'].", - DeprecationWarning, - stacklevel=2, - ) - return result - except KeyError: - raise KeyError from e - else: - raise KeyError from e - - def _keys_slice(self, start, stop, direction, page_size: Optional[int] = None, **kwargs): - sorted_keys = reversed(self._stream_names) if direction < 0 else self._stream_names - return (yield from sorted_keys[start:stop]) - - def _items_slice(self, start, stop, direction, page_size: Optional[int] = None, **kwargs): - sorted_keys = reversed(self._stream_names) if direction < 0 else self._stream_names - for key in sorted_keys[start:stop]: - yield key, self[key] - return - - def __iter__(self): - yield from self._stream_names - - def documents(self, fill=False): - with io.BytesIO() as buffer: - self.export(buffer, format="application/json-seq") - buffer.seek(0) - for line in buffer: - parsed = json.loads(line.decode().strip()) - yield parsed["name"], _document_types[parsed["name"]](parsed["doc"]) - - -class BlueskyRunV2SQL(BlueskyRunV2, _BlueskyRunSQL): - def 
__getitem__(self, key): - # For v2, we need to handle the streams and configs keys specially - if isinstance(key, tuple): - key = "/".join(key) - - key, *rest = key.split("/", 1) - - if key == "streams": - raise KeyError( - "Looks like you are trying to access the 'streams' namespace, " - "but this pathway has never been supported in the .v2 BlueskyRun client. " - "Please access the stream directly, e.g. run['primary']." - ) - - stream_composite_client = super().__getitem__(key) - stream_container = BlueskyEventStreamV2SQL.from_stream_client(stream_composite_client) - - return stream_container[rest[0]] if rest else stream_container - - -class BlueskyRunV3(_BlueskyRunSQL): - """A BlueskyRun that is backed by a SQL database.""" - - _version = "3.0" - - def __new__(cls, context, *, item, structure_clients, **kwargs): - # When inheriting, return the class itself - if cls is not BlueskyRunV3 or cls._is_sql(item): - return super().__new__(cls, context, item=item, structure_clients=structure_clients, **kwargs) - else: - return BlueskyRunV2Mongo(context, item=item, structure_clients=structure_clients, **kwargs) - - def __getattr__(self, key): - # A shortcut to the stream data - if key in self._stream_names: - return self["streams"][key] if self._has_streams_namespace else self[key] - - return super().__getattr__(key) - - def __repr__(self): - metadata = self.metadata - datetime_ = datetime.fromtimestamp(metadata["start"]["time"]) - return ( - f"" - ) - - @property - def v1(self): - "Access to legacy interface" - return self.v2.v1 - - @property - def v2(self): - structure_clients = copy.copy(self.structure_clients) - structure_clients.set("BlueskyRun", lambda: BlueskyRunV2) - return BlueskyRunV2(self.context, item=self.item, structure_clients=structure_clients) - - @property - def v3(self): - return self diff --git a/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/catalog_of_bluesky_runs.py b/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/catalog_of_bluesky_runs.py 
deleted file mode 100644 index 4473bede3..000000000 --- a/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/catalog_of_bluesky_runs.py +++ /dev/null @@ -1,236 +0,0 @@ -import collections.abc -import copy -import functools -import numbers -import operator -from typing import Any - -from tiled.client.container import Container -from tiled.client.utils import handle_error -from tiled.queries import Comparison, Eq, Like -from tiled.utils import safe_json_dump - -from ..queries import RawMongo, ScanIDRange, TimeRange, _PartialUID, _ScanID -from .bluesky_run import BlueskyRunV2, BlueskyRunV3 - - -class CatalogOfBlueskyRuns(Container): - """ - This adds some bluesky-specific conveniences to the standard client Container. - - >>> catalog.scan_id[1234] # scan_id lookup - >>> catalog.uid["9acjef"] # (partial) uid lookup - >>> catalog[1234] # automatically do scan_id lookup for positive integer - >>> catalog["9acjef"] # automatically do (partial) uid lookup for string - >>> catalog[-5] # automatically do catalog.values()[-N] for negative integer - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.scan_id = IndexCallable(self._lookup_by_scan_id) - self.uid = IndexCallable(self._lookup_by_partial_uid) - self._v1 = None - - def __repr__(self): - # This is a copy/paste of the general-purpose implementation - # tiled.utils.node_repr - # with some modifications to extract scan_id from the metadata. - sample = self.items()[:10] - # Use scan_id (int) if defined; otherwise fall back to uid. - sample_reprs = [repr(value.metadata.get("start", {}).get("scan_id", key)) for key, value in sample] - out = " 60: # character count - break - out += ", " + sample_repr - counter += 1 - approx_len = operator.length_hint(self) # cheaper to compute than len(node) - # Are there more in the node that what we displayed above? 
- if approx_len > counter: - out += f", ...}} ~{approx_len} entries>" - else: - out += "}>" - return out - - @property - def v1(self): - "Accessor to legacy interface." - if self._v1 is None: - from databroker.v1 import Broker - - self._v1 = Broker(self) - self._v1._version = "1.0" - return self._v1 - - @property - def v2(self): - structure_clients = copy.copy(self.structure_clients) - structure_clients.set("BlueskyRun", lambda: BlueskyRunV2) - return CatalogOfBlueskyRuns(self.context, item=self.item, structure_clients=structure_clients) - - @property - def v3(self): - if not self.is_sql: - raise NotImplementedError("v3 is only available for SQL-based catalogs.") - - structure_clients = copy.copy(self.structure_clients) - structure_clients.set("BlueskyRun", lambda: BlueskyRunV3) - return CatalogOfBlueskyRuns(self.context, item=self.item, structure_clients=structure_clients) - - @functools.cached_property - def is_sql(self): - for spec in self.specs: - if spec.name == "CatalogOfBlueskyRuns": - if spec.version and spec.version.startswith("3."): - return True - return False - - def __getitem__(self, key): - # For convenience and backward-compatiblity reasons, we support - # some "magic" here that is helpful in an interactive setting. - if isinstance(key, str): - # CASE 1: Interpret key as a uid or partial uid. - if len(key) == 36: - # This looks like a full uid. Try direct lookup first. - try: - return super().__getitem__(key) - except KeyError: - # Fall back to partial uid lookup below. - pass - return self._lookup_by_partial_uid(key) - elif isinstance(key, numbers.Integral): - if key > 0: - # CASE 2: Interpret key as a scan_id. - return self._lookup_by_scan_id(int(key)) - else: - # CASE 3: Interpret key as a recently lookup, as in - # `catalog[-1]` is the latest entry. 
- key = int(key) - return self.values()[key] - elif isinstance(key, slice): - if (key.start is None) or (key.start >= 0): - raise ValueError( - "For backward-compatibility reasons, slicing here " - "is limited to negative indexes. " - "Use .values() to slice how you please." - ) - return self.values()[key] - elif isinstance(key, collections.abc.Iterable): - # We know that isn't a str because we check that above. - # Recurse. - return [self[item] for item in key] - else: - raise ValueError("Indexing expects a string, an integer, or a collection of strings and/or integers.") - - def _lookup_by_scan_id(self, scan_id): - results = self.search(Eq("start.scan_id", scan_id)) - if not results: - raise KeyError(f"No match for scan_id={scan_id}") - else: - # Return latest match. - return results.values().last() - - def _lookup_by_partial_uid(self, partial_uid): - if len(partial_uid) < 5: - raise ValueError(f"Partial uid {partial_uid!r} is too short. It must include at least 5 characters.") - if self.is_sql: - query = Like("start.uid", f"{partial_uid}%") - else: - query = _PartialUID(partial_uids=[partial_uid]) - results = self.search(query).values().head(2) - if len(results) > 1: - raise ValueError( - f"Partial uid {partial_uid} has multiple matches. Include more characters to get a unique match." - ) - if not results: - raise KeyError(f"No match for partial_uid {partial_uid}") - # There is one unique result. - return results[0] - - def get_serializer(self): - from tiled.server.app import get_root_tree - - if not hasattr(self.context.http_client, "app"): - raise NotImplementedError("Only works on local application.") - tree = self.context.http_client.app.dependency_overrides[get_root_tree]() - return tree.get_serializer() - - def search(self, query): - # These query types were formerly handled server side by specially-registered - # queries. Now that are transformed client side into generic queries that - # come standard with the Tiled server. 
- - # Some need to be expressed as a chain of queries. - if isinstance(query, TimeRange): - result = self - if query.since: - result = Container.search(result, Comparison("ge", "start.time", query.since)) - if query.until: - result = Container.search(result, Comparison("lt", "start.time", query.until)) - # For backward-compatiblity, accept a dict and interpret it as a Mongo - # query against the 'start' documents. - elif isinstance(query, _ScanID): - if len(query.scan_ids) > 1: - raise ValueError("Search on multiple ScanIDs in one query is no longer supported.") - (scan_id,) = query.scan_ids - query = Eq("start.scan_id", int(scan_id)) - result = super().search(query) - elif isinstance(query, _PartialUID): - if len(query.partial_uids) > 1: - raise ValueError("Search on multiple PartialUIDs in one query is no longer supported.") - (partial_uid,) = query.partial_uids - if self.is_sql: - query = Like("start.uid", f"{partial_uid}%") - else: - query = _PartialUID(partial_uids=[partial_uid]) - result = super().search(query) - elif isinstance(query, ScanIDRange): - ge = Comparison("ge", "start.scan_id", query.start_id) - lt = Comparison("lt", "start.scan_id", query.end_id) - result = super().search(ge).search(lt) - elif isinstance(query, dict): - query = RawMongo(start=query) - result = super().search(query) - else: - if hasattr(query, "key"): - if not query.key.startswith("start.") or query.key.startswith("stop."): - # Default to searching RunStart document. - query = copy.copy(query) - query.key = f"start.{query.key}" - result = super().search(query) - return result - - def post_document(self, name, doc): - link = self.item["links"]["self"].replace("/metadata", "/documents", 1) - response = self.context.http_client.post(link, content=safe_json_dump({"name": name, "doc": doc})) - handle_error(response) - - -class IndexCallable: - """ - DEPRECATED and no longer used internally - - Provide getitem syntax for functions - - >>> def inc(x): - ... 
return x + 1 - - >>> I = IndexCallable(inc) - >>> I[3] - 4 - - Vendored from dask - """ - - __slots__ = ("fn",) - - def __init__(self, fn: Any) -> None: - self.fn = fn - - def __getitem__(self, key: str) -> Any: - return self.fn(key) diff --git a/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/document.py b/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/document.py deleted file mode 100644 index 15669e2a5..000000000 --- a/bluesky-tiled-plugins/bluesky_tiled_plugins/clients/document.py +++ /dev/null @@ -1,175 +0,0 @@ -import copy - -from dask.base import normalize_token - - -class NotMutable(Exception): - pass - - -class Document(dict): - """ - Document is an immutable dict subclass. - - It is immutable to help consumer code avoid accidentally corrupting data - that another part of the consumer code was expected to use unchanged. - - Subclasses of Document must define __dask_tokenize__. The tokenization - schemes typically uniquely identify the document based on only a subset of - its contents, and mutating the contents can thereby create situations where - two unequal objects have colliding tokens. Immutability helps guard against - this too. - - Note that Documents are not *recursively* immutable. Just as it is possible - create a tuple (immutable) of lists (mutable) and mutate the lists, it is - possible to mutate the internal contents of a Document, but this should not - be done. It is safer to use the to_dict() method to create a mutable deep - copy. - - This is implemented as a dict subclass in order to satisfy certain - consumers that expect an object that satisfies isinstance(obj, dict). - This implementation detail may change in the future. - """ - - __slots__ = ("__not_a_real_dict",) - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # This lets pickle recognize that this is not a literal dict and that - # it should respect its custom __setstate__. 
- self.__not_a_real_dict = True - - def __repr__(self): - # same as dict, but wrapped in the class name so the eval round-trips - return f"{self.__class__.__name__}({dict(self)})" - - def _repr_pretty_(self, p, cycle): - """ - A multi-line but eval-able text repr with readable indentation - - This hooks into IPython/Jupyter's display mechanism - This is *not* invoked by print() or repr(), but it is invoked by - IPython.display.display() which is called in this common scenario:: - - In [1]: doc = Document(...) - In [2]: doc - - """ - # Note: IPython's pretty-prettying mechanism is custom and complex. - # The `text` method used below is a direct and blunt way to engage it - # and seems widely used in the IPython code base. There are other - # specific mechanisms for displaying collections like dicts, but they - # can *truncate* which I think we want to avoid and they would require - # more investment to understand how to use. - from pprint import pformat - - return p.text(f"{self.__class__.__name__}({pformat(dict(self))})") - - def __getstate__(self): - return dict(self) - - def __setstate__(self, state): - dict.update(self, state) - self.__not_a_real_dict = True - - def __readonly(self, *args, **kwargs): - raise NotMutable( - "Documents are not mutable. Call the method to_dict() to make a " - "fully independent and mutable deep copy." - ) - - def __setitem__(self, key, value): - try: - self.__not_a_real_dict # noqa: B018 - except AttributeError: - # This path is necessary to support un-pickling. - return dict.__setitem__(self, key, value) - else: - self.__readonly() - - __delitem__ = __readonly - pop = __readonly - popitem = __readonly - clear = __readonly - setdefault = __readonly - update = __readonly - - def to_dict(self): - """ - Create a mutable deep copy. - """ - # Convert to dict and then make a deep copy to ensure that if the user - # mutates any internally nested dicts there is no spooky action at a - # distance. 
- return copy.deepcopy(dict(self)) - - def __deepcopy__(self, memo): - # Without this, copy.deepcopy(Document(...)) fails because deepcopy - # creates a new, empty Document instance and then tries to add items to - # it. - return self.__class__({k: copy.deepcopy(v, memo) for k, v in self.items()}) - - def __dask_tokenize__(self): - raise NotImplementedError - - -# We must use dask's registration mechanism to tell it to treat Document -# specially. Dask's tokenization dispatch mechanism discovers that Docuemnt is -# a dict subclass and treats it as a dict, ignoring its __dask_tokenize__ -# method. To force it to respect our cutsom tokenization, we must explicitly -# register it. - - -@normalize_token.register(Document) -def tokenize_document(instance): - return instance.__dask_tokenize__() - - -class Start(Document): - def __dask_tokenize__(self): - return ("start", self["uid"]) - - -class Stop(Document): - def __dask_tokenize__(self): - return ("stop", self["uid"]) - - -class Resource(Document): - def __dask_tokenize__(self): - return ("resource", self["uid"]) - - -class StreamResource(Document): - def __dask_tokenize__(self): - return ("stream_resource", self["uid"]) - - -class Descriptor(Document): - def __dask_tokenize__(self): - return ("descriptor", self["uid"]) - - -class Event(Document): - def __dask_tokenize__(self): - return ("event", self["uid"]) - - -class EventPage(Document): - def __dask_tokenize__(self): - return ("event_page", self["uid"]) - - -class Datum(Document): - def __dask_tokenize__(self): - return ("datum", self["datum_id"]) - - -class StreamDatum(Document): - def __dask_tokenize__(self): - return ("stream_datum", self["uid"]) - - -class DatumPage(Document): - def __dask_tokenize__(self): - return ("datum_page", self["uid"]) diff --git a/bluesky-tiled-plugins/bluesky_tiled_plugins/exporters.py b/bluesky-tiled-plugins/bluesky_tiled_plugins/exporters.py deleted file mode 100644 index 195c2f7e4..000000000 --- 
a/bluesky-tiled-plugins/bluesky_tiled_plugins/exporters.py +++ /dev/null @@ -1,156 +0,0 @@ -import copy -import json -from collections import defaultdict - - -async def json_seq_exporter(mimetype, adapter, metadata, filter_for_access): - """Export BlueskyRun as newline-delimited sequence of JSON documents. - - This callback is to be configured on the server-side to enable exporting - BlueskyRun objects in JSON-Seq format. - - The resulting stream yields strings, each of which is a JSON document - representing one of the standard Bluesky documents: start, descriptor, - event, stream_resource, stream_datum, and stop, in the appropriate order. - - For example: - - ``` - {"name": "start", "doc": {...}} - {"name": "descriptor", "doc": {...}} - {"name": "event", "doc": {...}} - {"name": "stream_resource", "doc": {...}} - {"name": "stream_datum", "doc": {...}} - ... - {"name": "stop", "doc": {...}} - ``` - """ - for spec in adapter.specs: - if spec.name == "BlueskyRun" and spec.version.startswith("3."): - break - else: - raise ValueError("This exporter only works with BlueskyRun v3.x") - - adapter = await filter_for_access(adapter) - yield json.dumps({"name": "start", "doc": metadata.get("start", {})}) - result = [] - - # Generate descriptors - stream_names = await adapter.keys_range(offset=0, limit=None) - if "streams" in stream_names: - # Check for backward compatibility with the old layout (with an intermediate "streams" node) - streams_adapter = await adapter.lookup_adapter(["streams"]) - if "BlueskyEventStream" not in {s.name for s in streams_adapter.specs}: - adapter = streams_adapter - stream_names = await adapter.keys_range(offset=0, limit=None) - - for desc_name in stream_names: - desc_node = await adapter.lookup_adapter([desc_name]) - desc_meta = desc_node.metadata() - part_names = set(await desc_node.keys_range(offset=0, limit=None)) # Composite parts - - # First (or the only) descriptor - desc_doc = {k: v for k, v in desc_meta.items() if k not in 
{"_config_updates"}} - desc_doc["run_start"] = metadata.get("start", {}).get("uid") - desc_doc["name"] = desc_name - desc_doc["object_keys"] = defaultdict(list) - for key, val in desc_doc["data_keys"].items(): - if obj_name := val.get("object_name"): - desc_doc["object_keys"][obj_name].append(key) - - result.append({"name": "descriptor", "doc": desc_doc}) - - # Process subsequent descriptors, if any - desc_time_uids = [{"uid": desc_doc["uid"], "time": desc_doc["time"]}] - for upd in desc_meta.get("_config_updates", []): - desc_doc = copy.deepcopy(desc_doc) - desc_doc["uid"] = upd["uid"] - desc_doc["time"] = upd["time"] - desc_time_uids.extend([{"uid": desc_doc["uid"], "time": desc_doc["time"]}]) - for obj_name, obj in upd.get("configuration", {}).items(): - # This assumes that that the full configuration was present in the first descriptor - for key in obj["data"].keys(): - desc_doc["configuration"][obj_name]["data"][key] = obj["data"][key] - desc_doc["configuration"][obj_name]["timestamps"][key] = obj["timestamps"][key] - - result.append({"name": "descriptor", "doc": desc_doc}) - - # Generate events - if "internal" in part_names: - internal_node = await desc_node.lookup_adapter(["internal"]) - df = await internal_node.read() - keys = [k for k in df.columns if k not in {"seq_num", "time"} and not k.startswith("ts_")] - for row in df.to_dict(orient="records"): - desc_uid = desc_time_uids[0]["uid"] # same as desc_node.metadata()["uid"] if no updates - for _desc_uid_time in desc_time_uids[1:]: - if _desc_uid_time["time"] <= row["time"]: - desc_uid = _desc_uid_time["uid"] - event_doc = {"seq_num": row["seq_num"], "time": row["time"]} - event_doc["uid"] = f"event-{desc_uid}-{row['seq_num']}" # can be anything (unique) - event_doc["descriptor"] = desc_uid - event_doc["data"] = {k: row[k].tolist() if hasattr(row[k], "__array__") else row[k] for k in keys} - event_doc["timestamps"] = {k: row[f"ts_{k}"] for k in keys} - result.append({"name": "event", "doc": event_doc}) - - 
# Generate Stream Resources and Datums - desc_uid = desc_node.metadata()["uid"] - for data_key in part_names.difference(("internal",)): - # Loop over data_keys for external data only - sres_uid = f"sr-{desc_uid}-{data_key}" # can be anything (unique) - ds = (await desc_node.lookup_adapter([data_key])).data_sources[0] - uri = ds.assets[0].data_uri - for ast in ds.assets: - if ast.parameter in {"data_uris", "data_uri"}: - uri = ast.data_uri - break - sres_doc = { - "data_key": data_key, - "uid": sres_uid, - "run_start": metadata.get("start", {}).get("uid"), - "mimetype": ds.mimetype, - "parameters": ds.parameters, - "uri": uri, - } - result.append({"name": "stream_resource", "doc": sres_doc}) - - # Generate a single stream_datum document for the entire stream - sdat_uid = f"sd-{desc_uid}-{data_key}-0" # can be anything (unique) - total_shape = ds.structure.shape - datum_shape = desc_node.metadata()["data_keys"][data_key]["shape"] - - max_indx = ( - total_shape[0] // datum_shape[0] - 1 - if len(total_shape) == len(datum_shape) - else total_shape[0] - 1 - ) - sdat_doc = { - "uid": sdat_uid, - "stream_resource": sres_uid, - "descriptor": desc_uid, - "indices": {"start": 0, "stop": max_indx}, - "seq_nums": {"start": 1, "stop": max_indx + 1}, - } - result.append({"name": "stream_datum", "doc": sdat_doc}) - - # Make sure that the order of documents is (approximately) correct - result = sorted( - result, - key=lambda x: ( - x["doc"].get("time", float("inf")), - {"stream_resource": 0, "stream_datum": 1}.get(x["name"]), - ), - ) - - # Combine events into event_pages - # if modules_available("databroker"): - # from databroker.mongo_normalized import batch_documents - # - # result = [ - # {"name": x[0], "doc": x[1]} - # for x in batch_documents([(y["name"], y["doc"]) for y in result], size=1000) - # ] - - for doc in result: - yield "\n" + json.dumps(doc) - - yield "\n" + json.dumps({"name": "stop", "doc": metadata.get("stop", {})}) diff --git 
a/bluesky-tiled-plugins/bluesky_tiled_plugins/queries.py b/bluesky-tiled-plugins/bluesky_tiled_plugins/queries.py deleted file mode 100644 index e26bf3608..000000000 --- a/bluesky-tiled-plugins/bluesky_tiled_plugins/queries.py +++ /dev/null @@ -1,312 +0,0 @@ -""" -This module defines objects designed to make queries on -CatalogOfBlueskyRuns convenient. - -Older clients used these query object to issue custom query types. -This requires servers to register custom implementations of those -query types. - -Newer clients use these object as pure client-side conveniences. In -`CatalogOfBlueskyRuns.search` method, they are decomposed into standard Tiled -queries, requiring no custom counterpart on the server. - -The registration and serialization aspects are (temporarily) retained in order -to support older clients querying against MongoDB-backed servers. -""" - -import enum -import warnings -from dataclasses import asdict, dataclass -from typing import Optional - -# Not all of these are used, but import them all -# for user convenience so everything can be imported from bluesky_tiled_plugins.queries -from tiled.queries import ( # noqa: F401 - Comparison, - Contains, - Eq, - FullText, - In, - Key, - NotEq, - NotIn, - Operator, - QueryValueError, - Regex, -) -from tiled.query_registration import register - - -class Duplicates(str, enum.Enum): - latest = "latest" - all = "all" - error = "error" - - -@register(name="scan_id") -@dataclass -class _ScanID: - """ - Find matches to scan_id(s). 
- """ - - scan_ids: list[int] - duplicates: Duplicates - - def __init__(self, *, scan_ids, duplicates): - self.scan_ids = scan_ids - self.duplicates = Duplicates(duplicates) - - def encode(self): - return { - "scan_ids": ",".join(str(scan_id) for scan_id in self.scan_ids), - "duplicates": self.duplicates.value, - } - - @classmethod - def decode(cls, *, scan_ids, duplicates): - return cls( - scan_ids=[int(scan_id) for scan_id in scan_ids.split(",")], - duplicates=Duplicates(duplicates), - ) - - -def ScanID(*scan_ids, duplicates="latest"): - # Wrap _ScanID to provide a nice usage for *one or more scan_ids*: - # >>> ScanID(5) - # >>> ScanID(5, 6, 7) - # Placing a varargs parameter (*scan_ids) in the dataclass constructor - # would cause trouble on the server side and generally feels "wrong" - # so we have this wrapper function instead. - return _ScanID(scan_ids=scan_ids, duplicates=duplicates) - - -@register(name="scan_id_range") -@dataclass -class ScanIDRange: - """ - Find scans in the range. - """ - - start_id: int - end_id: int - duplicates: Duplicates - - def __init__(self, start_id, end_id, duplicates="latest"): - self.start_id = start_id - self.end_id = end_id - self.duplicates = Duplicates(duplicates) - - def encode(self): - return { - "start_id": self.start_id, - "end_id": self.end_id, - "duplicates": self.duplicates.value, - } - - @classmethod - def decode(cls, *, start_id, end_id, duplicates="latest"): - return cls( - start_id=int(start_id), - end_id=int(end_id), - duplicates=Duplicates(duplicates), - ) - - -@register(name="partial_uid") -@dataclass -class _PartialUID: - """ - Find matches to (partial) uid(s). - """ - - partial_uids: list[str] - - def encode(self): - return {"partial_uids": ",".join(str(uid) for uid in self.partial_uids)} - - @classmethod - def decode(cls, *, partial_uids): - return cls(partial_uids=partial_uids.split(",")) - - -def PartialUID(*partial_uids): - # See comment above with ScanID and _ScanID. Same thinking here. 
- return _PartialUID(partial_uids) - - -def RawMongo(start): - """ - DEPRECATED - - Raw MongoDB queries are no longer supported. If it is possible to express - the import as a supported query, we transform it and warn. If not, we raise - an error. - """ - - if len(start) == 1: - ((key, value),) = start.items() - if not isinstance(value, dict): - # We can transform this into a simple query. - warnings.warn( - """RawMongo will not be supported -in a future release of databroker, and its functionality has been limited. -Instead, use: - - Key("{key}") == {value!r} -""", - stacklevel=2, - ) - return Key(key) == value - raise ValueError( - """Arbitrary MongoDB queries no longer supported. - -If this is critical to you, please open an issue at - - https://github.com/bluesky/databroker - -describing your use case and we will see what we can work out.""" - ) - - -# human friendly timestamp formats we'll parse -_TS_FORMATS = [ - "%Y-%m-%d %H:%M:%S", - "%Y-%m-%d %H:%M", # these 2 are not as originally doc'd, - "%Y-%m-%d %H", # but match previous pandas behavior - "%Y-%m-%d", - "%Y-%m", - "%Y", -] - - -def _normalize_human_friendly_time(val, tz): - """Given one of : - - string (in one of the formats below) - - datetime (eg. datetime.now()), with or without tzinfo) - - timestamp (eg. time.time()) - return a timestamp (seconds since jan 1 1970 UTC). - - Non string/datetime values are returned unaltered. - Leading/trailing whitespace is stripped. - Supported formats: - {} - """ - # {} is placeholder for formats; filled in after def... - - from datetime import datetime - - import pytz - - zone = pytz.timezone(tz) # tz as datetime.tzinfo object - epoch = pytz.UTC.localize(datetime(1970, 1, 1)) - check = True - - if isinstance(val, str): - # unix 'date' cmd format '%a %b %d %H:%M:%S %Z %Y' works but - # doesn't get TZ? - - # Could cleanup input a bit? remove leading/trailing [ :,-]? - # Yes, leading/trailing whitespace to match pandas behavior... 
- # Actually, pandas doesn't ignore trailing space, it assumes - # the *current* month/day if they're missing and there's - # trailing space, or the month is a single, non zero-padded digit.?! - val = val.strip() - - for fmt in _TS_FORMATS: - try: - ts = datetime.strptime(val, fmt) - break - except ValueError: - pass - - try: - if isinstance(ts, datetime): - val = ts - check = False - else: - # what else could the type be here? - raise TypeError(f"expected datetime, got {ts:r}") - - except NameError: - raise ValueError("failed to parse time: " + repr(val)) from None - - if check and not isinstance(val, datetime): - return val - - if val.tzinfo is None: - # is_dst=None raises NonExistent and Ambiguous TimeErrors - # when appropriate, same as pandas - val = zone.localize(val, is_dst=None) - - return (val - epoch).total_seconds() - - -@register(name="time_range") -@dataclass -class TimeRange: - """ - A search query representing a time range. - - Parameters - ---------- - since, until: dates gives as timestamp, datetime, or human-friendly string, optional - timezone : string - As in, 'US/Eastern'. If None is given, tzlocal is used. - - Examples - -------- - Any granularity (year, month, date, hour, minute, second) is accepted. - - >>> TimeRange(since='2014') - - >>> TimeRange(until='2019-07') - - >>> TimeRange(since='2014-07-04', until='2020-07-04') - - >>> TimeRange(since='2014-07-04 05:00') - - """ - - timezone: str - since: Optional[float] = None - until: Optional[float] = None - - def __init__(self, *, timezone=None, since=None, until=None): - # Stash the raw values just for use in the repr. 
- self._raw_since = since - self._raw_until = until - - if timezone is None: - import tzlocal - - lz = tzlocal.get_localzone() - try: - timezone = lz.key - except AttributeError: - timezone = lz.zone - self.timezone = timezone - if since is None: - self.since = None - else: - self.since = _normalize_human_friendly_time(since, tz=self.timezone) - if until is None: - self.until = None - else: - self.until = _normalize_human_friendly_time(until, tz=self.timezone) - if since is not None and until is not None: - if self.since > self.until: - raise ValueError("since must not be greater than until.") - - def __repr__(self): - return ( - f"{type(self).__name__!s}(" - f"timezone={self.timezone!r}, since={self._raw_since!r}, until={self._raw_until!r})" - ) - - def encode(self): - return asdict(self) - - @classmethod - def decode(cls, *, timezone, since=None, until=None): - return cls(timezone=timezone, since=since, until=until) diff --git a/bluesky-tiled-plugins/bluesky_tiled_plugins/utils.py b/bluesky-tiled-plugins/bluesky_tiled_plugins/utils.py deleted file mode 100644 index 7c4e374de..000000000 --- a/bluesky-tiled-plugins/bluesky_tiled_plugins/utils.py +++ /dev/null @@ -1,20 +0,0 @@ -import collections.abc - - -def truncate_json_overflow(data): - """Truncate large numerical values to avoid overflow issues when serializing as JSON. - - This preemptively truncates large integers and floats with zero fractional part to fit within - the JSON limits for integers, i.e. (-2^53, 2^53 - 1], in case the values are implicitly - converted during serialization. 
- """ - if isinstance(data, collections.abc.Mapping): - return {k: truncate_json_overflow(v) for k, v in data.items()} - elif isinstance(data, collections.abc.Iterable) and not isinstance(data, str): - # Handle lists, tuples, arrays, etc., but not strings - return [truncate_json_overflow(item) for item in data] - elif isinstance(data, (int, float)) and not (data % 1) and not (1 - 2**53 <= data <= 2**53 - 1): - return min(max(data, 1 - 2**53), 2**53 - 1) # Truncate integers to fit in JSON (53 bits max) - elif isinstance(data, float) and (data < -1.7976e308 or data > 1.7976e308): - return min(max(data, -1.7976e308), 1.7976e308) # (Approx.) truncate floats to fit in JSON to avoid inf - return data diff --git a/bluesky-tiled-plugins/bluesky_tiled_plugins/writing/__init__.py b/bluesky-tiled-plugins/bluesky_tiled_plugins/writing/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/bluesky-tiled-plugins/bluesky_tiled_plugins/writing/_dispatcher.py b/bluesky-tiled-plugins/bluesky_tiled_plugins/writing/_dispatcher.py deleted file mode 100644 index f901b418f..000000000 --- a/bluesky-tiled-plugins/bluesky_tiled_plugins/writing/_dispatcher.py +++ /dev/null @@ -1,371 +0,0 @@ -""" -This module vendors the Dispatcher from bluesky.run_engine, in order -to avoid a bluesky dependency, since bluesky-tiled-plugins is frequently -used in data analysis environments where a bluesky dependency would be -superfluous. - -This dispatcher could in the future be move upstream to event_model -where it could be shared by bluesky and bluesky-tiled-plugins. - -That code has been extremely stable for about ten years, so divergence -is not a pressing concern. 
-""" - -import sys -import types -from itertools import count -from warnings import warn -from weakref import WeakKeyDictionary, ref - -from event_model import DocumentNames - - -class Dispatcher: - """Dispatch documents to user-defined consumers on the main thread.""" - - def __init__(self): - self.cb_registry = CallbackRegistry(allowed_sigs=DocumentNames) - self._counter = count() - self._token_mapping = dict() # noqa: C408 - - def process(self, name, doc): - """ - Dispatch document ``doc`` of type ``name`` to the callback registry. - - Parameters - ---------- - name : {'start', 'descriptor', 'event', 'stop'} - doc : dict - """ - exceptions = self.cb_registry.process(name, name.name, doc) - for exc, traceback in exceptions: # noqa: B007 - warn( # noqa: B028 - "A %r was raised during the processing of a %s " # noqa: UP031 - "Document. The error will be ignored to avoid " - "interrupting data collection. To investigate, " - "set RunEngine.ignore_callback_exceptions = False " - "and run again." % (exc, name.name) - ) - - def subscribe(self, func, name="all"): - """ - Register a callback function to consume documents. - - .. versionchanged :: 0.10.0 - The order of the arguments was swapped and the ``name`` - argument has been given a default value, ``'all'``. Because the - meaning of the arguments is unambiguous (they must be a callable - and a string, respectively) the old order will be supported - indefinitely, with a warning. - - .. versionchanged :: 0.10.0 - The order of the arguments was swapped and the ``name`` - argument has been given a default value, ``'all'``. Because the - meaning of the arguments is unambiguous (they must be a callable - and a string, respectively) the old order will be supported - indefinitely, with a warning. 
- - Parameters - ---------- - func: callable - expecting signature like ``f(name, document)`` - where name is a string and document is a dict - name : {'all', 'start', 'descriptor', 'event', 'stop'}, optional - the type of document this function should receive ('all' by - default). - - Returns - ------- - token : int - an integer ID that can be used to unsubscribe - - See Also - -------- - :meth:`Dispatcher.unsubscribe` - an integer token that can be used to unsubscribe - """ - if callable(name) and isinstance(func, str): - name, func = func, name - warn( # noqa: B028 - "The order of the arguments has been changed. Because the " - "meaning of the arguments is unambiguous, the old usage will " - "continue to work indefinitely, but the new usage is " - "encouraged: call subscribe(func, name) instead of " - "subscribe(name, func). Additionally, the 'name' argument " - "has become optional. Its default value is 'all'." - ) - if name == "all": - private_tokens = [] - for key in DocumentNames: - private_tokens.append(self.cb_registry.connect(key, func)) - public_token = next(self._counter) - self._token_mapping[public_token] = private_tokens - return public_token - - name = DocumentNames[name] - private_token = self.cb_registry.connect(name, func) - public_token = next(self._counter) - self._token_mapping[public_token] = [private_token] - return public_token - - def unsubscribe(self, token): - """ - Unregister a callback function using its integer ID. 
- - Parameters - ---------- - token : int - the integer ID issued by :meth:`Dispatcher.subscribe` - - See Also - -------- - :meth:`Dispatcher.subscribe` - """ - for private_token in self._token_mapping.pop(token, []): - self.cb_registry.disconnect(private_token) - - def unsubscribe_all(self): - """Unregister all callbacks from the dispatcher.""" - for public_token in list(self._token_mapping.keys()): - self.unsubscribe(public_token) - - @property - def ignore_exceptions(self): - return self.cb_registry.ignore_exceptions - - @ignore_exceptions.setter - def ignore_exceptions(self, val): - self.cb_registry.ignore_exceptions = val - - -class CallbackRegistry: - """ - See matplotlib.cbook.CallbackRegistry. This is a simplified since - ``bluesky`` is python3.4+ only! - """ - - def __init__(self, ignore_exceptions=False, allowed_sigs=None): - self.ignore_exceptions = ignore_exceptions - self.allowed_sigs = allowed_sigs - self.callbacks = dict() # noqa: C408 - self._cid = 0 - self._func_cid_map = {} - - def __getstate__(self): - # We cannot currently pickle the callables in the registry, so - # return an empty dictionary. - return {} - - def __setstate__(self, state): - # re-initialise an empty callback registry - self.__init__() - - def connect(self, sig, func): - """Register ``func`` to be called when ``sig`` is generated - - Parameters - ---------- - sig - func - - Returns - ------- - cid : int - The callback index. To be used with ``disconnect`` to deregister - ``func`` so that it will no longer be called when ``sig`` is - generated - """ - if self.allowed_sigs is not None: - if sig not in self.allowed_sigs: - raise ValueError(f"Allowed signals are {self.allowed_sigs}") - self._func_cid_map.setdefault(sig, WeakKeyDictionary()) - # Note proxy not needed in python 3. - # TODO rewrite this when support for python2.x gets dropped. 
- # Following discussion with TC: weakref.WeakMethod can not be used to - # replace the custom 'BoundMethodProxy', because it does not accept - # the 'destroy callback' as a parameter. The 'destroy callback' is - # necessary to automatically unsubscribe CB registry from the callback - # when the class object is destroyed and this is the main purpose of - # BoundMethodProxy. - proxy = _BoundMethodProxy(func) - if proxy in self._func_cid_map[sig]: - return self._func_cid_map[sig][proxy] - - proxy.add_destroy_callback(self._remove_proxy) - self._cid += 1 - cid = self._cid - self._func_cid_map[sig][proxy] = cid - self.callbacks.setdefault(sig, dict()) # noqa: C408 - self.callbacks[sig][cid] = proxy - return cid - - def _remove_proxy(self, proxy): - # need the list because `del self._func_cid_map[sig]` mutates the dict - for sig, proxies in list(self._func_cid_map.items()): - try: - # Here we need to delete the last reference to proxy (in 'self.callbacks[sig]') - # The respective entries in 'self._func_cid_map' are deleted automatically, - # since 'self._func_cid_map[sig]' entries are WeakKeyDictionary objects. - del self.callbacks[sig][proxies[proxy]] - except KeyError: - pass - - # Remove dictionary items for signals with no assigned callbacks - if len(self.callbacks[sig]) == 0: - del self.callbacks[sig] - del self._func_cid_map[sig] - - def disconnect(self, cid): - """Disconnect the callback registered with callback id *cid* - - Parameters - ---------- - cid : int - The callback index and return value from ``connect`` - """ - for eventname, callbackd in self.callbacks.items(): # noqa: B007 - try: - # This may or may not remove entries in 'self._func_cid_map'. - del callbackd[cid] - except KeyError: - continue - else: - # Look for cid in 'self._func_cid_map' as well. It may still be there. 
- for sig, functions in self._func_cid_map.items(): # noqa: B007 - for function, value in list(functions.items()): - if value == cid: - del functions[function] - return - - def process(self, sig, *args, **kwargs): - """Process ``sig`` - - All of the functions registered to receive callbacks on ``sig`` - will be called with ``args`` and ``kwargs`` - - Parameters - ---------- - sig - args - kwargs - """ - if self.allowed_sigs is not None: - if sig not in self.allowed_sigs: - raise ValueError(f"Allowed signals are {self.allowed_sigs}") - exceptions = [] - if sig in self.callbacks: - for cid, func in list(self.callbacks[sig].items()): # noqa: B007 - try: - func(*args, **kwargs) - except ReferenceError: - self._remove_proxy(func) - except Exception as e: - if self.ignore_exceptions: - exceptions.append((e, sys.exc_info()[2])) - else: - raise - return exceptions - - -class _BoundMethodProxy: - """ - Our own proxy object which enables weak references to bound and unbound - methods and arbitrary callables. Pulls information about the function, - class, and instance out of a bound method. Stores a weak reference to the - instance to support garbage collection. - @organization: IBM Corporation - @copyright: Copyright (c) 2005, 2006 IBM Corporation - @license: The BSD License - Minor bugfixes by Michael Droettboom - """ - - def __init__(self, cb): - self._hash = hash(cb) - self._destroy_callbacks = [] - try: - # This branch is successful if 'cb' bound method and class method, - # but destroy_callback mechanism works only for bound methods, - # since cb.__self__ points to class instance only for - # bound methods, not for class methods. Therefore destroy_callback - # will not be called for class methods. - try: - self.inst = ref(cb.__self__, self._destroy) - except TypeError: - self.inst = None - self.func = cb.__func__ - self.klass = cb.__self__.__class__ - - except AttributeError: - # 'cb' is a function, callable object or static method. 
- # No weak reference is created, strong reference is stored instead. - self.inst = None - self.func = cb - self.klass = None - - def add_destroy_callback(self, callback): - self._destroy_callbacks.append(_BoundMethodProxy(callback)) - - def _destroy(self, wk): - for callback in self._destroy_callbacks: - try: - callback(self) - except ReferenceError: - pass - - def __getstate__(self): - d = self.__dict__.copy() - # de-weak reference inst - inst = d["inst"] - if inst is not None: - d["inst"] = inst() - return d - - def __setstate__(self, statedict): - self.__dict__ = statedict - inst = statedict["inst"] - # turn inst back into a weakref - if inst is not None: - self.inst = ref(inst) - - def __call__(self, *args, **kwargs): - """ - Proxy for a call to the weak referenced object. Take - arbitrary params to pass to the callable. - Raises `ReferenceError`: When the weak reference refers to - a dead object - """ - if self.inst is not None and self.inst() is None: - raise ReferenceError - elif self.inst is not None: - # build a new instance method with a strong reference to the - # instance - - mtd = types.MethodType(self.func, self.inst()) - - else: - # not a bound method, just return the func - mtd = self.func - # invoke the callable and return the result - return mtd(*args, **kwargs) - - def __eq__(self, other): - """ - Compare the held function and instance with that held by - another proxy. - """ - try: - if self.inst is None: - return self.func == other.func and other.inst is None - else: - return self.func == other.func and self.inst() == other.inst() - except Exception: - return False - - def __ne__(self, other): - """ - Inverse of __eq__. 
- """ - return not self.__eq__(other) - - def __hash__(self): - return self._hash diff --git a/bluesky-tiled-plugins/bluesky_tiled_plugins/writing/_json_writer.py b/bluesky-tiled-plugins/bluesky_tiled_plugins/writing/_json_writer.py deleted file mode 100644 index 1564fad2a..000000000 --- a/bluesky-tiled-plugins/bluesky_tiled_plugins/writing/_json_writer.py +++ /dev/null @@ -1,66 +0,0 @@ -import json -from datetime import datetime -from pathlib import Path -from typing import Optional - -# NOTE: This code is duplicated in src/bluesky/callbacks/json_writer.py - - -class JSONWriter: - """Writer of Bluesky documents of a single run into a JSON file as an array. - - The file is created when a Start document is received, each new document is - written immediately, and the JSON array is closed when the "stop" document - is received. - """ - - def __init__( - self, - dirname: str, - filename: Optional[str] = None, - ): - self.dirname = Path(dirname) - self.filename = filename - - def __call__(self, name, doc): - if name == "start": - self.filename = self.filename or f"{doc['uid'].split('-')[0]}.json" - with open(self.dirname / self.filename, "w") as file: - file.write("[\n") - json.dump({"name": name, "doc": doc}, file) - file.write(",\n") - - elif name == "stop": - with open(self.dirname / self.filename, "a") as file: - json.dump({"name": name, "doc": doc}, file) - file.write("\n]") - - else: - with open(self.dirname / self.filename, "a") as file: - json.dump({"name": name, "doc": doc}, file) - file.write(",\n") - - -class JSONLinesWriter: - """Writer of Bluesky documents into a JSON Lines file - - If the file already exists, new documents will be appended to it. 
- """ - - def __init__(self, dirname: str, filename: Optional[str] = None): - self.dirname = Path(dirname) - self.filename = filename - - def __call__(self, name, doc): - if not self.filename: - if name == "start": - # If the first document is a start document, use the uid to create a filename - self.filename = f"{doc['uid'].split('-')[0]}.jsonl" - else: - # If the first document is not a start document, use the current date - self.filename = f"{datetime.today().strftime('%Y-%m-%d')}.jsonl" - mode = "a" if (self.dirname / self.filename).exists() else "w" - - with open(self.dirname / self.filename, mode) as file: - json.dump({"name": name, "doc": doc}, file) - file.write("\n") diff --git a/bluesky-tiled-plugins/bluesky_tiled_plugins/writing/consolidators.py b/bluesky-tiled-plugins/bluesky_tiled_plugins/writing/consolidators.py deleted file mode 100644 index 91de1328c..000000000 --- a/bluesky-tiled-plugins/bluesky_tiled_plugins/writing/consolidators.py +++ /dev/null @@ -1,596 +0,0 @@ -import collections -import dataclasses -import os -import re -import warnings -from typing import Literal, Union, cast - -import numpy as np -from event_model.documents import EventDescriptor, StreamDatum, StreamResource -from tiled.mimetypes import DEFAULT_ADAPTERS_BY_MIMETYPE -from tiled.structures.array import ArrayStructure, BuiltinDtype, StructDtype -from tiled.structures.core import StructureFamily -from tiled.structures.data_source import Asset, DataSource, Management - - -@dataclasses.dataclass -class Patch: - shape: tuple[int, ...] - offset: tuple[int, ...] - - @classmethod - def combine_patches(cls, patches: list["Patch"]) -> "Patch": - """Combine multiple patches into a single patch - - The combined patch covers the union (smallest bounding box) of all provided patches. - - Parameters - ---------- - patches : list[Patch] - A list of Patch objects to combine. - - Returns - ------- - Patch - A new Patch object that covers the union of all input patches. 
- """ - - # Determine the overall shape and offset - min_offset = list(patches[0].offset) - max_extent = [offset + size for offset, size in zip(patches[0].offset, patches[0].shape)] - - for patch in patches[1:]: - for i in range(len(min_offset)): - min_offset[i] = min(min_offset[i], patch.offset[i]) - max_extent[i] = max(max_extent[i], patch.offset[i] + patch.shape[i]) - - combined_shape = tuple(max_e - min_o for min_o, max_e in zip(min_offset, max_extent)) - combined_offset = tuple(min_offset) - - return cls(shape=combined_shape, offset=combined_offset) - - -class ConsolidatorBase: - """Consolidator of StreamDatums - - A Consolidator consumes documents from RE; it is similar to usual Bluesky Handlers but is designed to work - with streaming data (received via StreamResource and StreamDatum documents). It composes details (DataSource - and its Assets) that will go into the Tiled database. Each Consolidator is instantiated per a Stream Resource. - - Tiled Adapters will later use this to read the data, with good random access and bulk access support. - - We put this code into consolidators so that additional, possibly very unusual, formats can be supported by - users without getting a PR merged into Bluesky or Tiled. - - The CONSOLIDATOR_REGISTRY (see example below) and the Tiled catalog parameter adapters_by_mimetype can be used - together to support: - - Ingesting a new mimetype from Bluesky documents and generating DataSource and Asset with appropriate - parameters (the consolidator's job); - - Interpreting those DataSource and Asset parameters to do I/O (the adapter's job). - - To implement new Consolidators for other mimetypes, subclass ConsolidatorBase, possibly expand the - `consume_stream_datum` and `get_data_source` methods, and ensure that the keys of returned `adapter_parameters` - dictionary matches the expected adapter signature. Declare a set of supported mimetypes to allow validation and - automated discovery of the subclassed Consolidator. 
- - Attributes: - ----------- - - supported_mimetypes : set[str] - a set of mimetypes that can be handled by a derived Consolidator class; raises ValueError if attempted to - pass Resource documents related to unsupported mimetypes. - join_method : Literal["stack", "concat"] - a method to join the data; if "stack", the resulting consolidated dataset is produced by joining all datums - along a new dimension added on the left, e.g. a stack of tiff images, otherwise -- datums will be appended - to the end of the existing leftmost dimension, e.g. rows of a table (similarly to concatenation in numpy). - join_chunks : bool - if True, the chunking of the resulting dataset will be determined after consolidation, otherwise each part - is considered to be chunked separately. - """ - - supported_mimetypes: set[str] = {"application/octet-stream"} - join_method: Literal["stack", "concat"] = "concat" - join_chunks: bool = True - - def __init__(self, stream_resource: StreamResource, descriptor: EventDescriptor): - self.mimetype = self.get_supported_mimetype(stream_resource) - - self.data_key = stream_resource["data_key"] - self.uri = stream_resource["uri"] - self.assets: list[Asset] = [Asset(data_uri=self.uri, is_directory=False, parameter="data_uris", num=0)] - self._sres_parameters = stream_resource["parameters"] - - # Find datum shape and machine dtype - data_desc = descriptor["data_keys"][self.data_key] - if None in data_desc["shape"]: - raise NotImplementedError(f"Consolidator for {self.mimetype} does not support variable-sized data") - self.datum_shape: tuple[int, ...] 
= cast(tuple[int, ...], tuple(data_desc["shape"])) - self.datum_shape = () if self.datum_shape == (1,) and self.join_method == "stack" else self.datum_shape - - # Check that the datum shape is consistent between the StreamResource and the Descriptor - if multiplier := self._sres_parameters.get("multiplier"): - self.datum_shape = self.datum_shape or (multiplier,) # If datum_shape is not set - if self.datum_shape[0] != multiplier: - if self.datum_shape[0] == 1: - self.datum_shape = (multiplier,) + self.datum_shape[1:] - else: - self.datum_shape = (multiplier,) + self.datum_shape - # TODO: Check consistency with chunk_shape - - # Determine the machine data type; fall back to np.dtype("float64") if not set - self.data_type: Union[BuiltinDtype, StructDtype] - dtype_descr = data_desc.get("dtype_numpy") - if isinstance(dtype_descr, list): - # np.dtype requires tuples in struct dtypes, not lists - self.data_type = StructDtype.from_numpy_dtype(np.dtype(list(map(tuple, dtype_descr)))) - else: - self.data_type = BuiltinDtype.from_numpy_dtype(np.dtype(dtype_descr)) - - # Set chunk (or partition) shape - self.chunk_shape = self._sres_parameters.get("chunk_shape", ()) - if any(d <= 0 for d in self.chunk_shape): - raise ValueError(f"Chunk size in all dimensions must be at least 1: chunk_shape={self.chunk_shape}.") - - # Possibly overwrite the join_method and join_chunks attributes - self.join_method = self._sres_parameters.get("join_method", self.join_method) - self.join_chunks = self._sres_parameters.get("join_chunks", self.join_chunks) - - self._num_rows: int = 0 # Number of rows in the Data Source (all rows, includung skips) - self._seqnums_to_indices_map: dict[int, int] = {} - - # Set the dimension names if provided - self.dims: tuple[str, ...] 
= tuple(data_desc.get("dims", ())) - - @classmethod - def get_supported_mimetype(cls, sres): - if (cls is not ConsolidatorBase) and (sres["mimetype"] not in cls.supported_mimetypes): - raise ValueError(f"A data source of {sres['mimetype']} type can not be handled by {cls.__name__}.") - return sres["mimetype"] - - @property - def shape(self) -> tuple[int, ...]: - """Native shape of the data stored in assets - - This includes the leading (0th) dimension corresponding to the number of rows (if the join_method is stack) - including skipped rows, if any. The number of relevant usable data rows may be lower, which is determined - by the `seq_nums` field of StreamDatum documents.""" - - if (self.join_method == "concat") and len(self.datum_shape) > 0: - return self._num_rows * self.datum_shape[0], *self.datum_shape[1:] - - return self._num_rows, *self.datum_shape - - @property - def chunks(self) -> tuple[tuple[int, ...], ...]: - """Explicit (dask-style) specification of chunk sizes - - The produced chunk specification is a tuple of tuples of int that specify the sizes of each chunk in each - dimension; it is based on the StreamResource parameter `chunk_shape`. - - If `chunk_shape` is an empty tuple -- assume the dataset is stored as a single chunk for all existing and - new elements. Usually, however, `chunk_shape` is a tuple of int, in which case, we assume fixed-sized - chunks with at most `chunk_shape[0]` elements (i.e. `_num_rows`); last chunk can be smaller. If chunk_shape - is a tuple with less than `self.shape` elements -- assume it defines the chunk sizes along the leading - dimensions. - - If the joining method is "concat", and `join_chunks = False`, the chunking along the leftmost dimensions - is assumed to be preserved in each appended data point, i.e. consecutive chunks do not join, e.g. for a 1d - array with chunks (3,3,1), the resulting chunking after 3 repeats is (3,3,1,3,3,1,3,3,1). 
- When `join_chunks = True` (default), the chunk size along the leftmost dimension is determined by the - chunk_shape parameter; this is the case when `join_method == "stack"` well. - Chunking along the trailing dimensions is always preserved as in the original (single) array. - """ - - def list_summands(A: int, b: int, repeat: int = 1) -> tuple[int, ...]: - # Generate a list with repeated b summing up to A; append the remainder if necessary - # e.g. list_summands(13, 3) = [3, 3, 3, 3, 1] - # if `repeat = n`, n > 1, copy and repeat the entire result n times - return tuple([b] * (A // b) + ([A % b] if A % b > 0 else [])) * repeat or (0,) - - # If chunk shape is less than or equal to the total shape dimensions, chunk each specified dimension - # starting from the leading dimension - if len(self.chunk_shape) <= len(self.shape): - if ( - self.join_method == "stack" - or (self.join_method == "concat" and self.join_chunks) - or len(self.chunk_shape) == 0 - ): - result = tuple( - list_summands(ddim, cdim) - for ddim, cdim in zip(self.shape[: len(self.chunk_shape)], self.chunk_shape) - ) - else: - result = ( - list_summands(self.datum_shape[0], self.chunk_shape[0], repeat=self._num_rows), - *[ - list_summands(ddim, cdim) - for ddim, cdim in zip(self.shape[1 : len(self.chunk_shape)], self.chunk_shape[1:]) # noqa - ], - ) - return result + tuple((d,) for d in self.shape[len(self.chunk_shape) :]) # noqa: E203 - - # If chunk shape is longer than the total shape dimensions, raise an error - else: - raise ValueError( - f"The shape of chunks, {self.chunk_shape}, should be less than or equal to the shape of data, " - f"{self.shape}." - ) - - @property - def has_skips(self) -> bool: - """Indicates whether any rows should be skipped when mapping their indices to frame numbers - - This flag is intended to provide a shortcut for more efficient data access when there are no skips, and the - mapping between indices and seq_nums is straightforward. 
In other case, the _seqnums_to_indices_map needs - to be taken into account. - """ - return self._num_rows > len(self._seqnums_to_indices_map) - - def adapter_parameters(self) -> dict: - """A dictionary of parameters passed to an Adapter - - These parameters are intended to provide any additional information required to read a data source of a - specific mimetype, e.g. "path" the path into an HDF5 file or "template" the filename pattern of a TIFF - sequence. - - This method is to be subclassed as necessary. - """ - return {} - - def structure(self) -> ArrayStructure: - return ArrayStructure( - data_type=self.data_type, - shape=self.shape, - chunks=self.chunks, - dims=self.dims if self.dims else None, - ) - - def consume_stream_datum(self, doc: StreamDatum): - """Process a new StreamDatum and update the internal data structure - - This will be called for every new StreamDatum received to account for the new added rows. - This method _may need_ to be subclassed and expanded depending on a specific mimetype. - Actions: - - Parse the fields in a new StreamDatum - - Increment the number of rows (implemented by the Base class) - - Keep track of the correspondence between indices and seq_nums (implemented by the Base class) - - Update the list of assets, including their uris, if necessary - - Update shape and chunks - """ - old_shape = self.shape # Adding new rows updates self.shape - self._num_rows += doc["indices"]["stop"] - doc["indices"]["start"] - new_seqnums = range(doc["seq_nums"]["start"], doc["seq_nums"]["stop"]) - new_indices = range(doc["indices"]["start"], doc["indices"]["stop"]) - self._seqnums_to_indices_map.update(dict(zip(new_seqnums, new_indices))) - return Patch( - offset=(old_shape[0], *[0 for _ in self.shape[1:]]), - shape=(self.shape[0] - old_shape[0], *self.shape[1:]), - ) - - def get_data_source(self) -> DataSource: - """Return a DataSource object reflecting the current state of the streamed dataset. 
- - The returned DataSource is conceptually similar (and can be an instance of) tiled.structures.DataSource. In - general, it describes associated Assets (filepaths, mimetype) along with their internal data structure - (array shape, chunks, additional parameters) and should contain all information necessary to read the file. - """ - return DataSource( - mimetype=self.mimetype, - assets=self.assets, - structure_family=StructureFamily.array, - structure=self.structure(), - parameters=self.adapter_parameters(), - management=Management.external, - ) - - def init_adapter(self, adapter_class=None): - """Initialize a Tiled Adapter for reading the consolidated data - - Parameters - ---------- - adapter_class : Optional[Type[Adapter]] - An optional Adapter class to use for initialization; if not provided, the default adapter for the - Consolidator's mimetype will be used. - """ - - adapter_class = adapter_class or DEFAULT_ADAPTERS_BY_MIMETYPE.get(self.mimetype) - if not adapter_class: - raise ValueError(f"No adapter found for mimetype {self.mimetype}") - - # Mimic the necessary aspects of a Tiled node with a namedtuple - _Node = collections.namedtuple("Node", ["metadata_", "specs"]) - return adapter_class.from_catalog(self.get_data_source(), _Node({}, []), **self.adapter_parameters()) - - def update_from_stream_resource(self, stream_resource: StreamResource): - """Consume an additional related StreamResource document for the same data_key""" - - raise NotImplementedError("This method is not implemented in the base Consolidator class.") - - def validate(self, fix_errors=False) -> list[str]: - """Validate the Consolidator's state against the expected structure""" - - # Initialize adapter from uris and determine the structure - adapter_class = DEFAULT_ADAPTERS_BY_MIMETYPE[self.mimetype] - uris = [asset.data_uri for asset in self.assets] - structure = adapter_class.from_uris(*uris, **self.adapter_parameters()).structure() - notes = [] - - if self.shape != structure.shape: - if 
not fix_errors: - raise ValueError(f"Shape mismatch: {self.shape} != {structure.shape}") - else: - msg = f"Fixed shape mismatch: {self.shape} -> {structure.shape}" - warnings.warn(msg, stacklevel=2) - if self.join_method == "stack": - self._num_rows = structure.shape[0] - self.datum_shape = structure.shape[1:] - elif self.join_method == "concat": - # Estimate the number of frames_per_event (multiplier) - multiplier = 1 if structure.shape[0] % structure.chunks[0][0] else structure.chunks[0][0] - self._num_rows = structure.shape[0] // multiplier - self.datum_shape = (multiplier,) + structure.shape[1:] - notes.append(msg) - - if self.chunks != structure.chunks: - if not fix_errors: - raise ValueError(f"Chunk shape mismatch: {self.chunks} != {structure.chunks}") - else: - _chunk_shape = tuple(c[0] for c in structure.chunks) - msg = f"Fixed chunk shape mismatch: {self.chunk_shape} -> {_chunk_shape}" - warnings.warn(msg, stacklevel=2) - self.chunk_shape = _chunk_shape - notes.append(msg) - - if self.data_type != structure.data_type: - if not fix_errors: - raise ValueError(f"dtype mismatch: {self.data_type} != {structure.data_type}") - else: - msg = ( - f"Fixed dtype mismatch: {self.data_type.to_numpy_dtype()} " - f"-> {structure.data_type.to_numpy_dtype()}" - ) - warnings.warn(msg, stacklevel=2) - self.data_type = structure.data_type - notes.append(msg) - - if self.dims and (len(self.dims) != len(structure.shape)): - if not fix_errors: - raise ValueError( - f"Number of dimension names mismatch for a " - f"{len(structure.shape)}-dimensional array: {self.dims}" - ) - else: - old_dims = self.dims - if len(old_dims) < len(structure.shape): - self.dims = ( - ("time",) - + old_dims - + tuple(f"dim{i}" for i in range(len(old_dims) + 1, len(structure.shape))) - ) - else: - self.dims = old_dims[: len(structure.shape)] - msg = f"Fixed dimension names: {old_dims} -> {self.dims}" - warnings.warn(msg, stacklevel=2) - notes.append(msg) - - assert self.init_adapter() is not None, 
"Adapter can not be initialized" - - return notes - - def get_adapter(self, adapters_by_mimetype=None): - warnings.warn( - f"{self.__class__.__name__}.get_adapter is deprecated and will be removed in a future release; " - f"please, use {self.__class__.__name__}.init_adapter instead.", - DeprecationWarning, - stacklevel=2, - ) - adapter_class = (adapters_by_mimetype or {}).get(self.mimetype) - return self.init_adapter(adapter_class=adapter_class) - - -class CSVConsolidator(ConsolidatorBase): - supported_mimetypes: set[str] = {"text/csv;header=absent"} - join_method: Literal["stack", "concat"] = "concat" - join_chunks: bool = False - - def adapter_parameters(self) -> dict: - allowed_keys = { - "comment", - "delimiter", - "dtype", - "encoding", - "header", - "names", - "nrows", - "sep", - "skipfooter", - "skiprows", - "usecols", - } - return {k: v for k, v in {"header": None, **self._sres_parameters}.items() if k in allowed_keys} - - -class HDF5Consolidator(ConsolidatorBase): - supported_mimetypes = {"application/x-hdf5"} - - def adapter_parameters(self) -> dict: - """Parameters to be passed to the HDF5 adapter, a dictionary with the keys: - - dataset: list[str] - a path to the dataset within the hdf5 file represented as list split at `/` - swmr: bool -- True to enable the single writer / multiple readers regime - """ - params = {"dataset": self._sres_parameters["dataset"]} - if slice := self._sres_parameters.get("slice", False): - params["slice"] = slice - if squeeze := self._sres_parameters.get("squeeze", False): - params["squeeze"] = squeeze - - params["swmr"] = self._sres_parameters.get("swmr", True) - params["locking"] = self._sres_parameters.get("locking", None) - - return params - - def update_from_stream_resource(self, stream_resource: StreamResource): - """Add an Asset for a new StreamResource document""" - if stream_resource["parameters"]["dataset"] != self._sres_parameters["dataset"]: - raise ValueError("All StreamResource documents must have the same 
dataset path.") - if stream_resource["parameters"].get("chunk_shape", ()) != self._sres_parameters.get("chunk_shape", ()): - raise ValueError("All StreamResource documents must have the same chunk shape.") - - asset = Asset( - data_uri=stream_resource["uri"], is_directory=False, parameter="data_uris", num=len(self.assets) - ) - self.assets.append(asset) - - -class MultipartRelatedConsolidator(ConsolidatorBase): - def __init__( - self, permitted_extensions: set[str], stream_resource: StreamResource, descriptor: EventDescriptor - ): - super().__init__(stream_resource, descriptor) - self.permitted_extensions: set[str] = permitted_extensions - self.assets.clear() # Assets will be populated based on datum indices - self.data_uris: list[str] = [] - self.chunk_shape = self.chunk_shape or (1,) # I.e. number of frames per file (tiff, jpeg, etc.) - if self.join_method == "concat": - assert self.datum_shape[0] % self.chunk_shape[0] == 0, ( - f"Number of frames per file ({self.chunk_shape[0]}) must divide the total number of frames per " - f"datum ({self.datum_shape[0]}): variable-sized files are not allowed." - ) - - def int_replacer(match): - """Normalize filename template - - Replace an integer format specifier with a new-style format specifier, i.e. convert the template string - from "old" to "new" Python style, e.g. 
"%s%s_%06d.tif" to "filename_{:06d}.tif" - - """ - flags, width, precision, type_char = match.groups() - - # Handle the flags - flag_str = "" - if "-" in flags: - flag_str = "<" # Left-align - if "+" in flags: - flag_str += "+" # Show positive sign - elif " " in flags: - flag_str += " " # Space before positive numbers - if "0" in flags: - flag_str += "0" # Zero padding - - # Build width and precision if they exist - width_str = width if width else "" - precision_str = f".{precision}" if precision else "" - - # Handle cases like "%6.6d", which should be converted to "{:06d}" - if precision and width: - flag_str = "0" - precision_str = "" - width_str = str(max(precision, width)) - - # Construct the new-style format specifier - return f"{{:{flag_str}{width_str}{precision_str}{type_char}}}" - - self.template = ( - self._sres_parameters["template"] - .replace("%s", "{:s}", 1) - .replace("%s", "") - .replace("{:s}", self._sres_parameters.get("filename", ""), 1) - ) - self.template = re.sub(r"%([-+#0 ]*)(\d+)?(?:\.(\d+))?([d])", int_replacer, self.template) - - def get_datum_uri(self, indx: int): - """Return a full uri for a datum (an individual image file) based on its index in the sequence. - - This relies on the `template` parameter passed in the StreamResource, which is a string in the "new" - Python formatting style that can be evaluated to a file name using the `.format(indx)` method given an - integer index, e.g. "{:05d}.ext". - - If template is not set, we assume that the uri is provided directly in the StreamResource document (i.e. - a single file case), and return it as is. - """ - - if self.template: - assert os.path.splitext(self.template)[1] in self.permitted_extensions - return self.uri + self.template.format(indx) - else: - return self.uri - - def consume_stream_datum(self, doc: StreamDatum): - """Determine the number and names of files from indices of datums and the number of files per datum. 
- - In the most general case, each file may be a multipage tiff or a stack of images (frames) and a single - datum may be composed of multiple such files, leading to a total of self.datum_shape[0] frames. - Since each file necessarily represents a single chunk (tiffs can not be sub-chunked), the number of - frames per file is equal to the leftmost chunk_shape dimension, self.chunk_shape[0]. - The number of files produced per each datum is then the ratio of these two numbers. - - If `join_method == "stack"`, we assume that each datum becomes its own index in the new leftmost dimension - of the resulting dataset, and hence corresponds to a single file. - """ - - files_per_datum = self.datum_shape[0] // self.chunk_shape[0] if self.join_method == "concat" else 1 - first_file_indx = doc["indices"]["start"] * files_per_datum - last_file_indx = doc["indices"]["stop"] * files_per_datum - for indx in range(first_file_indx, last_file_indx): - new_datum_uri = self.get_datum_uri(indx) - new_asset = Asset( - data_uri=new_datum_uri, - is_directory=False, - parameter="data_uris", - num=len(self.assets) + 1, - ) - self.assets.append(new_asset) - self.data_uris.append(new_datum_uri) - - return super().consume_stream_datum(doc) - - -class TIFFConsolidator(MultipartRelatedConsolidator): - supported_mimetypes = {"multipart/related;type=image/tiff"} - - def __init__(self, stream_resource: StreamResource, descriptor: EventDescriptor): - super().__init__({".tif", ".tiff"}, stream_resource, descriptor) - - -class JPEGConsolidator(MultipartRelatedConsolidator): - supported_mimetypes = {"multipart/related;type=image/jpeg"} - - def __init__(self, stream_resource: StreamResource, descriptor: EventDescriptor): - super().__init__({".jpeg", ".jpg"}, stream_resource, descriptor) - - -class NPYConsolidator(MultipartRelatedConsolidator): - supported_mimetypes = {"multipart/related;type=application/x-npy"} - join_method: Literal["stack", "concat"] = "stack" - - # NOTE: NPYConsolidator is tailored 
for tests in databroker with ophyd.sim devices. - # Use with caution in other settings! - - def __init__(self, stream_resource: StreamResource, descriptor: EventDescriptor): - # Unlike other image sequence formats (e.g. TIFF) the filename - # template is hard-coded in the NPY_SEQ handler. We inject it - # here so that the rest of the processing can be handled - # generically by ConsolidatorBase. - stream_resource["parameters"]["template"] = "%s_%d.npy" - data_key = stream_resource["data_key"] - datum_shape = descriptor["data_keys"][data_key]["shape"] - stream_resource["parameters"]["chunk_shape"] = (1, *datum_shape) - super().__init__({".npy"}, stream_resource, descriptor) - - -CONSOLIDATOR_REGISTRY = collections.defaultdict( - lambda: ConsolidatorBase, - { - "text/csv;header=absent": CSVConsolidator, - "application/x-hdf5": HDF5Consolidator, - "multipart/related;type=image/tiff": TIFFConsolidator, - "multipart/related;type=image/jpeg": JPEGConsolidator, - "multipart/related;type=application/x-npy": NPYConsolidator, - }, -) - - -def consolidator_factory(stream_resource_doc, descriptor_doc): - consolidator_class = CONSOLIDATOR_REGISTRY[stream_resource_doc["mimetype"]] - return consolidator_class(stream_resource_doc, descriptor_doc) diff --git a/bluesky-tiled-plugins/bluesky_tiled_plugins/writing/tiled_writer.py b/bluesky-tiled-plugins/bluesky_tiled_plugins/writing/tiled_writer.py deleted file mode 100644 index b243fbc52..000000000 --- a/bluesky-tiled-plugins/bluesky_tiled_plugins/writing/tiled_writer.py +++ /dev/null @@ -1,927 +0,0 @@ -import copy -import itertools -import logging -from collections import defaultdict, deque, namedtuple -from pathlib import Path -from typing import Any, Callable, Optional, Union, cast -from warnings import warn - -import numpy -import pyarrow -from event_model import ( - DocumentNames, - DocumentRouter, - RunRouter, - schema_validators, - unpack_datum_page, - unpack_event_page, -) -from event_model.documents import ( - Datum, - 
DatumPage, - DocumentType, - Event, - EventDescriptor, - EventPage, - Resource, - RunStart, - RunStop, - StreamDatum, - StreamResource, -) -from event_model.documents.event_descriptor import DataKey -from event_model.documents.stream_datum import StreamRange -from tiled.client import from_profile, from_uri -from tiled.client.array import ArrayClient -from tiled.client.base import BaseClient -from tiled.client.container import Container -from tiled.client.dataframe import DataFrameClient -from tiled.client.utils import handle_error -from tiled.structures.core import Spec -from tiled.utils import safe_json_dump - -from ..utils import truncate_json_overflow -from ._dispatcher import Dispatcher -from ._json_writer import JSONLinesWriter -from .consolidators import ConsolidatorBase, DataSource, Patch, StructureFamily, consolidator_factory - -# Aggregate the Event table rows and StreamDatums in batches before writing to Tiled -BATCH_SIZE = 10000 - -# Maximum size of internal arrays from Event docs to write to tabular (SQL) storage; larger arrays will be written -# as zarr. Set to 0 to write all internal arrays as zarr, and -1 to write all internal arrays to tabular storage. 
-MAX_ARRAY_SIZE = 16 - -# Disallow using reserved words as data_keys identifiers -# Related: https://github.com/bluesky/event-model/pull/223 -RESERVED_DATA_KEYS = ["time", "seq_num"] - -# A lookup table for converting broad JSON types to numpy dtypes -JSON_TO_NUMPY_DTYPE = {"number": " 1: - raise ValueError("All StreamDatum documents must reference the same descriptor.") - if len({doc["stream_resource"] for doc in docs}) > 1: - raise ValueError("All StreamDatum documents must reference the same stream_resource.") - docs = tuple(sorted(docs, key=lambda doc: doc["indices"]["start"])) - for d1, d2 in zip(docs[:-1], docs[1:]): # TODO: use itertools.pairwise(docs) in python 3.10+ - if d1["indices"]["stop"] != d2["indices"]["start"]: - raise ValueError("StreamDatum documents must be consecutive.") - - return StreamDatum( - uid=docs[-1]["uid"], - stream_resource=docs[-1]["stream_resource"], - descriptor=docs[-1]["descriptor"], - indices=StreamRange(start=docs[0]["indices"]["start"], stop=docs[-1]["indices"]["stop"]), - seq_nums=StreamRange(start=docs[0]["seq_nums"]["start"], stop=docs[-1]["seq_nums"]["stop"]), - ) - - -# A named tuple to cache references to external data from Event documents. -ExternalEventDataReference = namedtuple( - "ExternalEventDataReference", - [ - "datum_id", # The UID of the Datum document that references this external data - "data_key", # The data_key of the external data - "desc_uid", # The UID of the EventDescriptor document that this datum belongs to - "seq_num", # The sequence number of the Event document - ], -) - - -class _ConditionalBackup: - """Callback that tries to call the primary callback and, if it fails, flushes the buffer to backup callbacks. - - Once an error has been encountererd in the primary callback, all subsequent documents would be sent to the - backup callbacks as well. - - This callback is intended to be used with a `RunRouter` and process documents from a single Bluesky run. 
- """ - - def __init__(self, primary_callback: Callable, backup_callbacks: list[Callable], maxlen: int = 1_000_000): - self.primary_callback = primary_callback - self.backup_callbacks = backup_callbacks - self._buffer: deque[tuple[str, DocumentType]] = deque(maxlen=maxlen) - self._push_to_backup = False - - def __call__(self, name: str, doc: DocumentType): - self._buffer.append((name, doc)) - - try: - self.primary_callback(name, doc) - except Exception as e: - logger.warning( - f"Primary callback {type(self.primary_callback).__name__} failed: {e}. " - "Flushing buffer to backup callbacks.", - stacklevel=2, - ) - self._push_to_backup = True - - if self._push_to_backup: - for name, doc in self._buffer: - for bcb in self.backup_callbacks: - try: - bcb(name, doc) - except Exception as e: - logger.warning( - f"Backup callback {bcb.__class__.__name__} failed with error: {e}", stacklevel=2 - ) - self._buffer.clear() - - -class RunNormalizer(DocumentRouter): - """Callback for updating Bluesky documents to their latest schema. - - This callback can be used to subscribe additional consumers that require the updated documents. - Returns a shallow copy of the document to avoid modifying the original one. - - Parameters - ---------- - patches : dict[str, Callable], optional - A dictionary of patch functions to apply to the documents before modifying them. - The keys are document names (e.g., "start", "stop", "descriptor", etc.), and the values - are functions that take a document as input and return a modified document. - spec_to_mimetype : dict[str, str], optional - A dictionary mapping spec names to MIME types. This is used to convert `Resource` documents - to the latest `StreamResource` schema. - The supplied dictionary updates the default `MIMETYPE_LOOKUP` dictionary. 
- """ - - def __init__( - self, - patches: Optional[dict[str, Callable]] = None, - spec_to_mimetype: Optional[dict[str, str]] = None, - ): - self._token_refs: dict[str, Callable] = {} - self.dispatcher = Dispatcher() - self.patches = patches or {} - self.spec_to_mimetype = MIMETYPE_LOOKUP | (spec_to_mimetype or {}) - - self._next_frame_index: dict[tuple[str, str], dict[str, int]] = defaultdict( - lambda: {"carry": 0, "index": 0} - ) - self._datum_cache: dict[str, Datum] = {} - self._ext_ref_cache: list[ExternalEventDataReference] = [] # Cache for references to external Event data - self._desc_name_by_uid: dict[str, str] = {} - self._sres_cache: dict[str, StreamResource] = {} - self._emitted: set[str] = set() # UIDs of the StreamResource documents that have been emitted - self._int_keys: set[str] = set() # Names of internal data_keys - self._ext_keys: set[str] = set() - self._specs_by_resource_uid = {} # Keep track of spec by Resource uid, used to enrich datum_kwargs - self.notes: list[str] = [] # Human-readable notes about modifications made to the documents - - def _convert_resource_to_stream_resource(self, doc: Union[Resource, StreamResource]) -> StreamResource: - """Make changes to and return a shallow copy of StreamRsource dictionary adhering to the new structure. - - Kept for back-compatibility with old StreamResource schema from event_model<1.20.0 - or Resource documents that are converted to StreamResources. - """ - stream_resource_doc = cast(StreamResource, doc) - - if "mimetype" not in doc: - # The document is a `Resource` or a < v1.20 `StreamResource`. - # Both are converted to latest version `StreamResource`. 
- for expected_key in ("spec", "root", "resource_path", "resource_kwargs"): - if expected_key not in doc: - raise RuntimeError( - f"`Resource` or `StreamResource` legacy document is missing a '{expected_key}'" - ) - - # Convert the Resource (or old StreamResource) document to a StreamResource document - resource_dict = cast(dict, doc) - stream_resource_doc["mimetype"] = self.spec_to_mimetype[resource_dict.pop("spec")] - stream_resource_doc["parameters"] = resource_dict.pop("resource_kwargs", {}) - file_path = Path(resource_dict.pop("root").strip("/")).joinpath( - resource_dict.pop("resource_path").strip("/") - ) - stream_resource_doc["uri"] = "file://localhost/" + str(file_path).lstrip("/") - - # Ensure that the internal path within HDF5 files is referenced with "dataset" parameter - if stream_resource_doc["mimetype"] == "application/x-hdf5": - stream_resource_doc["parameters"]["dataset"] = stream_resource_doc["parameters"].pop( - "path", stream_resource_doc["parameters"].pop("dataset", "") - ) - - # Ensure that only the necessary fields are present in the StreamResource document - stream_resource_doc["data_key"] = stream_resource_doc.get("data_key", "") - required_keys = {"data_key", "mimetype", "parameters", "uid", "uri"} - for key in set(stream_resource_doc.keys()).difference(required_keys): - stream_resource_doc.pop(key) # type: ignore - - return stream_resource_doc - - def _convert_datum_to_stream_datum( - self, datum_doc: Datum, data_key: str, desc_uid: str, seq_num: int - ) -> tuple[Optional[StreamResource], StreamDatum]: - """Convert the Datum document to the StreamDatum format - - This conversion requires (and is triggered when) the Event document is received. The function also returns - a corresponding StreamResource document, if it hasn't been emitted yet. - - Parameters - ---------- - datum_doc : Datum - The Datum document to convert. 
- data_key : str - The data_key of the external data in the Event document; this parameter must be included in the new - StreamResource document. - desc_uid : str - The UID of the EventDescriptor document that this datum belongs to. - seq_num : int - The sequence number of the Event document that this datum belongs to; 1-base index. - - Returns - ------- - sres_doc : StreamResource, optional - The corresponding StreamResource document, if it hasn't been emitted yet, otehrwise -- None. - sdat_doc : StreamDatum - The StreamDatum document corresponding to the Datum document. - """ - - # Some Datums contain datum_kwargs and the 'frame' field, which indicates the last index of the - # frame. This should take precedence over the 'seq_num' field in the Event document. Keep the - # last frame index in memory, since next Datums may refer to more than one frame (it is - # assumed that Events always refer to a single frame). - # There are cases when the frame_index is reset during the scan (e.g. if Datums for the same - # data_key belong to different Resources), so the 'carry' field is used to keep track of the - # previous frame index. 
- datum_kwargs = datum_doc.get("datum_kwargs", {}) - frame = datum_kwargs.pop("frame", None) - if frame is not None: - desc_name = self._desc_name_by_uid[desc_uid] # Name of the descriptor (stream) - _next_index = self._next_frame_index[(desc_name, data_key)] - index_start = sum(_next_index.values()) - _next_index["index"] = frame + 1 - index_stop = sum(_next_index.values()) - if index_stop < index_start: - # The datum is likely referencing a next Resource, but the indexing must continue - _next_index["carry"] = index_start - index_stop = sum(_next_index.values()) - else: - index_start, index_stop = seq_num - 1, seq_num - indices = StreamRange(start=index_start, stop=index_stop) - seq_nums = StreamRange(start=index_start + 1, stop=index_stop + 1) - - # produce the Resource document, if needed (add data_key to match the StreamResource schema) - # Emit a copy of the StreamResource document with a new uid; this allows to account for cases - # where one Resource is used by several data streams with different data_keys and datum_kwargs. 
- sres_doc = None - sres_uid = datum_doc["resource"] - new_sres_uid = sres_uid + "-" + data_key - if (sres_uid in self._sres_cache) and (new_sres_uid not in self._emitted): - sres_doc = copy.deepcopy(self._sres_cache[sres_uid]) - sres_doc["data_key"] = data_key - sres_doc["parameters"].update(datum_kwargs) - sres_doc["uid"] = new_sres_uid - - # Produce the StreamDatum document - sdat_doc = StreamDatum( - uid=datum_doc["datum_id"], - stream_resource=new_sres_uid, - descriptor=desc_uid, - indices=indices, - seq_nums=seq_nums, - ) - - return sres_doc, sdat_doc - - def start(self, doc: RunStart): - doc = copy.copy(doc) - if patch := self.patches.get("start"): - doc = patch(doc) - self.emit(DocumentNames.start, doc) - - def stop(self, doc: RunStop): - doc = copy.copy(doc) - if patch := self.patches.get("stop"): - doc = patch(doc) - - # If there are any cached references to external data, emit StreamResources and StreamDatums now - for datum_id, data_key, desc_uid, seq_num in self._ext_ref_cache: - if datum_doc := self._datum_cache.pop(datum_id, None): - sres_doc, sdat_doc = self._convert_datum_to_stream_datum(datum_doc, data_key, desc_uid, seq_num) - if (sres_doc is not None) and (sres_doc["uid"] not in self._emitted): - self.emit(DocumentNames.stream_resource, sres_doc) - self._emitted.add(sres_doc["uid"]) - self.emit(DocumentNames.stream_datum, sdat_doc) - else: - raise RuntimeError( - f"Cannot emit StreamDatum for {data_key} because the corresponding Datum document is missing." 
- ) - - doc["_run_normalizer_notes"] = self.notes or [] # Add notes about modifications to the stop document - - self.emit(DocumentNames.stop, doc) - - def descriptor(self, doc: EventDescriptor): - doc = copy.deepcopy(doc) - if patch := self.patches.get("descriptor"): - doc = patch(doc) - - # Rename data_keys that use reserved words, "time" and "seq_num" - for name in RESERVED_DATA_KEYS: - if name in doc["data_keys"].keys(): - if f"_{name}" in doc["data_keys"].keys(): - raise ValueError(f"Cannot rename {name} to _{name} because it already exists") - doc["data_keys"][f"_{name}"] = doc["data_keys"].pop(name) - for obj_data_keys_list in doc.get("object_keys", {}).values(): - if name in obj_data_keys_list: - obj_data_keys_list.remove(name) - obj_data_keys_list.append(f"_{name}") - - # Rename some fields (in-place) to match the current schema for the descriptor - # Loop over all dictionaries that specify data_keys (both event data_keys or configuration data_keys) - conf_data_keys = (obj["data_keys"].values() for obj in doc.get("configuration", {}).values()) - for data_keys_spec in itertools.chain(doc["data_keys"].values(), *conf_data_keys): - # Determine numpy data type. From highest precedent to lowest: - # 1. Try 'dtype_descr', optional, if present -- this is a structural dtype - # 2. Try 'dtype_numpy', optional in the document schema. - # 3. Try 'dtype_str', an old convention predataing 'dtype_numpy', not in the schema. - # 4. 
Get 'dtype', required by the schema, which is a fuzzy JSON spec like 'number' - # and make a best effort to convert it to a numpy spec like '= self._batch_size: - self._write_internal_data(data_cache, desc_node=self._desc_nodes[desc_uid]) - data_cache.clear() - - def event_page(self, doc: EventPage): - for _doc in unpack_event_page(doc): - self.event(_doc) - - def stream_resource(self, doc: StreamResource): - self._stream_resource_cache[doc["uid"]] = doc - - def get_sres_node(self, sres_uid: str, desc_uid: Optional[str] = None) -> tuple[BaseClient, ConsolidatorBase]: - """Get the Tiled node and the associate Consolidator corresponding to the data_key in StreamResource - - If the node does not exist, register it from a cached StreamResource document. Keep a reference to the - node and the corresponding Consolidator object. If the node already exists, return the existing one. - - The nodes and consolidators are referenced by both: - - sres_uid: the uid of the StreamResource document - - desc_name + data_key: the name of the descriptor (stream) and the data_key - """ - - if sres_uid in self._sres_nodes.keys(): - sres_node = self._sres_nodes[sres_uid] - consolidator = self._consolidators[sres_uid] - - elif sres_uid in self._stream_resource_cache.keys(): - if not desc_uid: - raise RuntimeError("Descriptor uid must be specified to initialize a Stream Resource node") - - # Define `full_data_key` as desc_name + _ + data_key to ensure uniqueness across streams - sres_doc = self._stream_resource_cache[sres_uid] - desc_node = self._desc_nodes[desc_uid] - full_data_key = f"{desc_node.item['id']}_{sres_doc['data_key']}" # desc_name + data_key - - # Check if there already exists a Node and a Consolidator for this data_key - # i.e. 
this is an additional StreamResource, whose data should be concatenated with the existing one - if full_data_key in self._sres_nodes.keys(): - sres_node = self._sres_nodes[full_data_key] - consolidator = self._consolidators[full_data_key] - consolidator.update_from_stream_resource(sres_doc) - else: - consolidator = consolidator_factory(sres_doc, desc_node.metadata) - sres_node = desc_node.new( - key=consolidator.data_key, - structure_family=StructureFamily.array, - data_sources=[consolidator.get_data_source()], - metadata={}, - specs=[], - access_tags=self.access_tags, - ) - - self._consolidators[sres_uid] = self._consolidators[full_data_key] = consolidator - self._sres_nodes[sres_uid] = self._sres_nodes[full_data_key] = sres_node - else: - raise RuntimeError(f"Stream Resource {sres_uid} is referenced before being received.") - - return sres_node, consolidator - - def stream_datum(self, doc: StreamDatum): - if self._batch_size <= 1: - # If batch size is 1, write the StreamDatum immediately - self._write_external_data(doc) - return - - # Try to concatenate and cache the StreamDatum document to process it later - sres_uid = doc["stream_resource"] - if cached_stream_datum_doc := self._external_data_cache.pop(sres_uid, None): - try: - _doc = concatenate_stream_datums(cached_stream_datum_doc, doc) - if _doc["indices"]["stop"] - _doc["indices"]["start"] >= self._batch_size: - self._write_external_data(_doc) - else: - self._external_data_cache[sres_uid] = _doc - except ValueError: - # If concatenation fails, write the cached document and then the new one immediately - self._write_external_data(cached_stream_datum_doc) - self._write_external_data(doc) - else: - self._external_data_cache[sres_uid] = doc - - -class TiledWriter: - """Callback for write metadata and data from Bluesky documents into Tiled. - - This callback relies on the `RunRouter` to route documents from one or more runs into - independent instances of the `_RunWriter` callback. 
The `RunRouter` is responsible for - creating a new instance of the `_RunWriter` for each run. - - Parameters - ---------- - client : `tiled.client.BaseClient` - The Tiled client to use for writing data. This client must be initialized with - the appropriate credentials and connection parameters to access the Tiled server. - normalizer : Optional[DocumentRouter] - A callback for normalizing Bluesky documents to the latest schema. If not provided, - the default `RunNormalizer` will be used. The supplied normalizer should accept - `patches` and `spec_to_mimetype` (or `**kwargs`) for initialization. - To disable normalization and pass the incoming document directly to _RunWriter, - set this parameter to `None`. - patches : Optional[dict[str, Callable]] - A dictionary of patch functions to apply to specific document types before normalizing - and writing them. The keys should be the document names (e.g., "start", "stop", - "descriptor", etc.), and the values should be functions that take a document and return - a modified document of the same type. - This argument is ignored if `normalizer` is set to `None`. - spec_to_mimetype : Optional[dict[str, str]] - A dictionary mapping spec names to MIME types. This is used to convert `Resource` documents - to the latest `StreamResource` schema. If not provided, the default mapping will be used. - This argument is ignored if `normalizer` is set to `None`. - backup_directory : Optional[str] - If specified, this directory will be used to back up runs that fail to be written - to Tiled. All documents for the entire Bluesky Run will be written in JSONLines format, - allowing for recovery in case of errors during the writing process. - batch_size : int - The number of Events or StreamDatums collect before writing them to Tiled. - This is useful for reducing the number of write operations and improving performance when - writing large amounts of data (e.g. database migration). 
For streaming applications, - it is recommended to set this parameter to <= 1, so that each Event or StreamDatum is written - to Tiled immediately after they are received. - """ - - def __init__( - self, - client: BaseClient, - *, - normalizer: Optional[type[DocumentRouter]] = RunNormalizer, - patches: Optional[dict[str, Callable]] = None, - spec_to_mimetype: Optional[dict[str, str]] = None, - backup_directory: Optional[str] = None, - batch_size: int = BATCH_SIZE, - max_array_size: int = MAX_ARRAY_SIZE, - ): - self.client = client.include_data_sources() - self.patches = patches or {} - self.spec_to_mimetype = spec_to_mimetype or {} - self.backup_directory = backup_directory - self._normalizer = normalizer - self._run_router = RunRouter([self._factory]) - self._batch_size = batch_size - self._max_array_size = max_array_size - - def _factory(self, name, doc): - """Factory method to create a callback for writing a single run into Tiled.""" - cb = run_writer = _RunWriter(self.client, batch_size=self._batch_size, max_array_size=self._max_array_size) - - if self._normalizer: - # If normalize is True, create a RunNormalizer callback to update documents to the latest schema - cb = self._normalizer(patches=self.patches, spec_to_mimetype=self.spec_to_mimetype) - cb.subscribe(run_writer) - - if self.backup_directory: - # If backup_directory is specified, create a conditional backup callback writing documents to JSONLines - cb = _ConditionalBackup(cb, [JSONLinesWriter(self.backup_directory)]) - - return [cb], [] - - @classmethod - def from_uri( - cls, - uri, - *, - normalizer: Optional[type[DocumentRouter]] = RunNormalizer, - patches: Optional[dict[str, Callable]] = None, - spec_to_mimetype: Optional[dict[str, str]] = None, - backup_directory: Optional[str] = None, - batch_size: int = BATCH_SIZE, - **kwargs, - ): - client = from_uri(uri, **kwargs) - return cls( - client, - normalizer=normalizer, - patches=patches, - spec_to_mimetype=spec_to_mimetype, - 
backup_directory=backup_directory, - batch_size=batch_size, - ) - - @classmethod - def from_profile( - cls, - profile, - *, - normalizer: Optional[type[DocumentRouter]] = RunNormalizer, - patches: Optional[dict[str, Callable]] = None, - spec_to_mimetype: Optional[dict[str, str]] = None, - backup_directory: Optional[str] = None, - batch_size: int = BATCH_SIZE, - **kwargs, - ): - client = from_profile(profile, **kwargs) - return cls( - client, - normalizer=normalizer, - patches=patches, - spec_to_mimetype=spec_to_mimetype, - backup_directory=backup_directory, - batch_size=batch_size, - ) - - def __call__(self, name, doc): - self._run_router(name, doc) diff --git a/bluesky-tiled-plugins/pyproject.toml b/bluesky-tiled-plugins/pyproject.toml deleted file mode 100644 index c6c17bbeb..000000000 --- a/bluesky-tiled-plugins/pyproject.toml +++ /dev/null @@ -1,50 +0,0 @@ -[build-system] -requires = ["hatchling", "hatch-vcs"] -build-backend = "hatchling.build" - -[project] -name = "bluesky-tiled-plugins" -description = "Tiled client plugins to provide an customized user experience for Bluesky data in Tiled" -readme = { file = "README.md", content-type = "text/markdown" } -authors = [ - { name = "Bluesky Project Contributors", email = "dallan@bnl.gov" }, -] -maintainers = [ - { name = "Brookhaven National Laboratory", email = "dallan@bnl.gov" }, -] -requires-python = ">=3.9" - -dependencies = [ - "dask", - "event-model", - "mongoquery", - "pytz", - "tiled[client] >=0.2.0", - "tzlocal", -] - -classifiers = [ - "Development Status :: 4 - Beta", - "License :: OSI Approved :: BSD License", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Topic :: Scientific/Engineering :: Physics", -] - -dynamic = ["version"] - 
-[project.entry-points."tiled.special_client"] -CatalogOfBlueskyRuns = "bluesky_tiled_plugins.clients.catalog_of_bluesky_runs:CatalogOfBlueskyRuns" -BlueskyRun = "bluesky_tiled_plugins.clients.bluesky_run:BlueskyRun" -BlueskyEventStream = "bluesky_tiled_plugins.clients.bluesky_event_stream:BlueskyEventStream" - -[tool.hatch] -version.source = "vcs" -version.raw-options = { root = ".." } -version.fallback-version = "0.0.0" -build.hooks.vcs.version-file = "bluesky_tiled_plugins/_version.py" diff --git a/bluesky-tiled-plugins/tests/conftest.py b/bluesky-tiled-plugins/tests/conftest.py deleted file mode 100644 index 17dc520f4..000000000 --- a/bluesky-tiled-plugins/tests/conftest.py +++ /dev/null @@ -1,24 +0,0 @@ -import asyncio - -import pytest -from bluesky.run_engine import RunEngine, TransitionError - - -@pytest.fixture(scope="function", params=[False, True]) -def RE(request): - loop = asyncio.new_event_loop() - loop.set_debug(True) - RE = RunEngine({}, call_returns_result=request.param, loop=loop) - - def clean_event_loop(): - if RE.state not in ("idle", "panicked"): - try: - RE.halt() - except TransitionError: - pass - loop.call_soon_threadsafe(loop.stop) - RE._th.join() - loop.close() - - request.addfinalizer(clean_event_loop) - return RE diff --git a/bluesky-tiled-plugins/tests/examples/external_assets.json b/bluesky-tiled-plugins/tests/examples/external_assets.json deleted file mode 100644 index 4e0dbbd31..000000000 --- a/bluesky-tiled-plugins/tests/examples/external_assets.json +++ /dev/null @@ -1,351 +0,0 @@ -[ - { - "name": "start", - "doc": { - "uid": "{{ uuid }}-9724b2201fe7", - "time": 1745500521.706236, - "scan_id": 3, - "plan_type": "generator", - "plan_name": "count", - "detectors": [ - "det-obj1", - "det-obj2" - ] - } - }, - { - "name": "descriptor", - "doc": { - "configuration": { - "det-obj1": { - "data": {}, - "timestamps": {}, - "data_keys": {} - } - }, - "data_keys": { - "det-key1": { - "source": "file", - "dtype": "number", - "dtype_numpy": " 
None: - self.name = name - self.root = root - self.counter = 0 - - -class StreamDatumReadableCollectable(Named, Readable, Collectable, WritesStreamAssets): - """Produces no events, but only StreamResources/StreamDatums and can be read or collected""" - - def _get_hdf5_stream(self, data_key: str, index: int) -> tuple[Optional[StreamResource], StreamDatum]: - file_path = os.path.join(self.root, "dataset.h5") - uid = f"{data_key}-uid" - data_desc = self.describe()[data_key] # Descriptor dictionary for the current data key - data_shape = cast(tuple[int, ...], tuple(data_desc["shape"])) - hdf5_dataset = f"/{data_key}/VALUE" - - stream_resource = None - if self.counter == 0: - # Backward compatibility test, ignore typing errors - stream_resource = StreamResource( # type: ignore[typeddict-unknown-key] - parameters={"dataset": hdf5_dataset, "chunk_shape": (100, *data_shape[1:]), "_validate": True}, - data_key=data_key, - root=self.root, - resource_path="/dataset.h5", - uri="file://localhost/" + file_path, - spec="AD_HDF5_SWMR_STREAM", - mimetype="application/x-hdf5", - uid=uid, - ) - # Initialize an empty HDF5 dataset (3D: var 1 dim, fixed 2 and 3 dims) - with h5py.File(file_path, "a") as f: - dset = f.require_dataset( - hdf5_dataset, - data_shape, - maxshape=(None, *data_shape[1:]), - dtype=np.dtype("float64"), - chunks=(100, *data_shape[1:]), - ) - - indx_min, indx_max = self.counter, self.counter + index - stream_datum = StreamDatum( - stream_resource=uid, - descriptor="", - uid=f"{uid}/{self.counter}", - indices={"start": indx_min, "stop": indx_max}, - seq_nums={"start": 0, "stop": 0}, # seq_nums will be overwritten by RunBundler - ) - - # Write (append to) the hdf5 dataset - with h5py.File(file_path, "a") as f: - dset = f[hdf5_dataset] - dset.resize([indx_max * data_shape[0], *data_shape[1:]]) - dset[indx_min * data_shape[0] : indx_max * data_shape[0], ...] 
= np.random.randn( # noqa: E203 - (indx_max - indx_min) * data_shape[0], *data_shape[1:] - ) - - return stream_resource, stream_datum - - def _get_tiff_stream(self, data_key: str, index: int) -> tuple[Optional[StreamResource], StreamDatum]: - file_path = self.root - for data_key in [f"{self.name}-sd3"]: - uid = f"{data_key}-uid" - data_desc = self.describe()[data_key] # Descriptor dictionary for the current data key - data_shape = cast(tuple[int, ...], tuple(data_desc["shape"])) - stream_resource = None - if self.counter == 0: - # Backward compatibility test, ignore typing errors - stream_resource = StreamResource( # type: ignore[typeddict-unknown-key] - parameters={ - "chunk_shape": (1, *data_shape), - "template": "{:05d}.tif", - "join_method": "stack", - "_validate": True, - }, - data_key=data_key, - root=self.root, - uri="file://localhost/" + self.root + "/", - spec="AD_TIFF", - mimetype="multipart/related;type=image/tiff", - uid=uid, - ) - - indx_min, indx_max = self.counter, self.counter + index - stream_datum = StreamDatum( - stream_resource=uid, - descriptor="", - uid=f"{uid}/{self.counter}", - indices={"start": indx_min, "stop": indx_max}, - seq_nums={"start": 0, "stop": 0}, # seq_nums will be overwritten by RunBundler - ) - - # Write a tiff file - data = np.random.randint(0, 255, data_shape, dtype="uint8") - tf.imwrite(os.path.join(file_path, f"{self.counter:05}.tif"), data) - - return stream_resource, stream_datum - - def describe(self) -> dict[str, DataKey]: - """Describe datasets which will be backed by StreamResources""" - return { - # Numerical data with 1 number per event in hdf5 format - f"{self.name}-sd1": DataKey( - source="file", - dtype="number", - dtype_numpy=np.dtype("float64").str, - shape=[ - 1, - ], - external="STREAM:", - ), - # 2-D data with 5 frames per event in hdf5 format - f"{self.name}-sd2": DataKey( - source="file", - dtype="array", - dtype_numpy=np.dtype("float64").str, - shape=[5, 10, 15], - external="STREAM:", - ), - # 3-D data 
with 10 frames per event in tiff format - f"{self.name}-sd3": DataKey( - source="file", - dtype="array", - dtype_numpy=np.dtype("uint8").str, - shape=[10, 5, 7, 4], - external="STREAM:", - ), - } - - def describe_collect(self) -> Union[dict[str, DataKey], dict[str, dict[str, DataKey]]]: - return self.describe() - - def collect_asset_docs(self, index: Optional[int] = None) -> Iterator[StreamAsset]: - """Produce a StreamResource and StreamDatum for all data keys for 0:index""" - index = index or 1 - data_keys_methods = { - f"{self.name}-sd1": self._get_hdf5_stream, - f"{self.name}-sd2": self._get_hdf5_stream, - f"{self.name}-sd3": self._get_tiff_stream, - } - - for data_key, method in data_keys_methods.items(): - stream_resource, stream_datum = method(data_key, index) - if stream_resource is not None: - yield "stream_resource", stream_resource - yield "stream_datum", stream_datum - - self.counter += index - - def get_index(self) -> int: - """Report how many frames were written""" - return self.counter - - def read(self) -> dict[str, Reading]: - """Produce an empty event""" - return {} - - -class SynSignalWithRegistry(ophyd.sim.SynSignalWithRegistry): - """A readable image detector that writes a sequence of files and generates relevant Bluesky documents. - - Subclassed from ophyd.sim to match the updated schema of Resource documents. - """ - - def __init__(self, *args, dtype_numpy="uint8", **kwargs): - self.dtype_numpy = dtype_numpy - super().__init__(*args, **kwargs) - - def stage(self): - super().stage() - parameters = { - "chunk_shape": (1,), - "template": "_{:d}." 
+ self.save_ext, - "join_method": "stack", - "_validate": True, - } - self._asset_docs_cache[-1][1]["resource_kwargs"].update(parameters) - - def describe(self): - res = super().describe() - for key in res: - res[key]["external"] = "FILESTORE" - res[key]["dtype_numpy"] = self.dtype_numpy - return res - - -def test_stream_datum_readable_counts(RE, client, tmp_path): - tw = TiledWriter(client) - det = StreamDatumReadableCollectable(name="det", root=str(tmp_path)) - RE(bp.count([det], 3), tw) - stream = client.values().last()["primary"] - keys = sorted(set(stream.base.keys()).difference({"internal"})) - - assert stream[keys[0]].shape == (3,) - assert stream[keys[1]].shape == (15, 10, 15) - assert stream[keys[2]].shape == (3, 10, 5, 7, 4) - assert stream[keys[0]].read() is not None - assert stream[keys[1]].read() is not None - assert stream[keys[2]].read() is not None - - -def test_stream_datum_readable_with_two_detectors(RE, client, tmp_path): - det1 = StreamDatumReadableCollectable(name="det1", root=str(tmp_path)) - det2 = StreamDatumReadableCollectable(name="det2", root=str(tmp_path)) - tw = TiledWriter(client) - RE(bp.count([det1, det2], 3), tw) - stream = client.values().last()["primary"] - keys = sorted(set(stream.base.keys()).difference({"internal"})) - - assert stream[keys[0]].shape == (3,) - assert stream[keys[1]].shape == (15, 10, 15) - assert stream[keys[2]].shape == (3, 10, 5, 7, 4) - assert stream[keys[3]].shape == (3,) - assert stream[keys[4]].shape == (15, 10, 15) - assert stream[keys[5]].shape == (3, 10, 5, 7, 4) - assert stream[keys[0]].read() is not None - assert stream[keys[1]].read() is not None - assert stream[keys[2]].read() is not None - assert stream[keys[3]].read() is not None - assert stream[keys[4]].read() is not None - assert stream[keys[5]].read() is not None - - -def test_stream_datum_collectable(RE, client, tmp_path): - det = StreamDatumReadableCollectable(name="det", root=str(tmp_path)) - tw = TiledWriter(client) - RE(collect_plan(det, 
name="primary"), tw) - stream = client.values().last()["primary"] - keys = sorted(set(stream.base.keys()).difference({"internal"})) - - assert stream[keys[0]].read() is not None - assert stream[keys[1]].read() is not None - assert stream[keys[2]].read() is not None - - -@pytest.mark.parametrize("frames_per_event", [1, 5, 10]) -def test_handling_non_stream_resource(RE, client, tmp_path, frames_per_event): - det = SynSignalWithRegistry( - func=lambda: np.random.randint(0, 255, (frames_per_event, 10, 15), dtype="uint8"), - dtype_numpy=np.dtype("uint8").str, - name="img", - labels={"detectors"}, - save_func=tf.imwrite, - save_path=str(tmp_path), - save_spec="AD_TIFF", - save_ext="tif", - ) - tw = TiledWriter(client) - RE(bp.count([det], 3), tw) - extr = client.values().last()["primary"].base["img"] - intr = client.values().last()["primary"].base["internal"] - assert extr.shape == (3, frames_per_event, 10, 15) - assert extr.read() is not None - assert set(intr.columns) == {"seq_num", "time"} - assert len(intr.read()) == 3 - assert (intr["seq_num"].read() == [1, 2, 3]).all() - - -def collect_plan(*objs, name="primary"): - yield from bps.open_run() - yield from bps.declare_stream(*objs, collect=True, name=name) - yield from bps.collect(*objs, return_payload=False, name=name) - yield from bps.close_run() - - -@pytest.mark.parametrize("fname", ["internal_events", "external_assets", "external_assets_legacy"]) -@pytest.mark.parametrize("batch_size", [0, 1, 1000, None]) -def test_with_correct_sample_runs(client, batch_size, external_assets_folder, fname): - if batch_size is None: - tw = TiledWriter(client) - else: - tw = TiledWriter(client, batch_size=batch_size) - for item in render_templated_documents(fname + ".json", external_assets_folder): - if item["name"] == "start": - uid = item["doc"]["uid"] - tw(**item) - - run = client[uid] - - for stream in run.values(): - assert stream.read() is not None - - -def test_dims_names(client, external_assets_folder): - tw = 
TiledWriter(client) - - for item in render_templated_documents("external_assets.json", external_assets_folder): - if item["name"] == "start": - uid = item["doc"]["uid"] - tw(**item) - - run = client[uid] - - assert run["primary"]["det-key1"].structure().dims is None - assert run["primary"]["det-key2"].structure().dims == ("time", "dim_x", "dim_y") - - -@pytest.mark.parametrize( - "batch_size, expected_patch_shapes, expected_patch_offsets", - [(1, (1, 1, 1), (0, 1, 2)), (2, (2, 1), (0, 2)), (5, (3,), (0,))], -) -def test_data_source_patching( - client, batch_size, expected_patch_shapes, expected_patch_offsets, external_assets_folder -): - tw = TiledWriter(client, batch_size=batch_size) - - with record_history() as history: - for item in render_templated_documents("external_assets.json", external_assets_folder): - tw(**item) - - def parse_data_source_uri(uri: str): - """Given a full data_source URL, extract: - - data_key (e.g. "det-key1") - - decoded query parameters as tuples of ints - - Returns: - (data_key, params_dict) - """ - - # data_key is the last component of the path - data_key = urlparse(uri).path.rstrip("/").split("/")[-1] - - # parse query parameters and convert comma-separated values to tuples of ints - params = {} - for k, v in parse_qs(urlparse(uri).query).items(): - params[k] = tuple(map(int, v[0].split(","))) # parse_qs gives lists - - return data_key, params - - put_uri_params = [ - parse_data_source_uri(str(req.url)) - for req in history.requests - if req.method == "PUT" and "/data_source" in req.url.path - ] - - # Check that each data key received the expected number of updates - assert len(put_uri_params) == 3 * len(expected_patch_shapes) # 3 data keys in the example - for data_key in {"det-key1", "det-key2", "det-key3"}: - assert len([uri for dk, uri in put_uri_params if dk == data_key]) == len(expected_patch_shapes) - - # Check that the patch sizes and offsets (leftmost dimensions) match expectations - actual_patch_sizes = 
tuple(params["patch_shape"][0] for dk, params in put_uri_params if dk == data_key) - actual_patch_offsets = tuple(params["patch_offset"][0] for dk, params in put_uri_params if dk == data_key) - assert actual_patch_sizes == expected_patch_shapes - assert actual_patch_offsets == expected_patch_offsets - - -@pytest.mark.parametrize("error_type", ["shape", "chunks", "dtype"]) -@pytest.mark.parametrize("validate", [True, False]) -def test_validate_external_data(client, external_assets_folder, error_type, validate): - tw = TiledWriter(client) - - documents = render_templated_documents("external_assets_single_key.json", external_assets_folder) - for item in documents: - name, doc = item["name"], item["doc"] - if name == "start": - uid = doc["uid"] - - # Modify the document to introduce an error - if (error_type == "shape") and (name == "descriptor"): - doc["data_keys"]["det-key2"]["shape"] = [1, 2, 3] # should be [1, 13, 17] - elif (error_type == "chunks") and name in {"resource", "stream_resource"}: - doc["parameters"]["chunk_shape"] = [1, 2, 3] # should be [100, 13, 17] - elif (error_type == "dtype") and (name == "descriptor"): - doc["data_keys"]["det-key2"]["dtype_numpy"] = np.dtype("int32").str # should be "int64" - - # Add flag to trigger validation - if name in {"resource", "stream_resource"} and validate: - doc["parameters"]["_validate"] = True - - # Check that the warning is issued when data changes during the validation - if name == "stop" and validate: - with pytest.warns(UserWarning): - tw(name, doc) - else: - tw(name, doc) - - # Try reading the imported data - run = client[uid] - if not validate and not error_type == "chunks": - with pytest.raises(ValueError): - assert run["primary"].read() is not None - else: - assert run["primary"].read() is not None - assert run["primary"]["det-key2"].read().shape == (8, 13, 17) - - -@pytest.mark.parametrize("squeeze", [True, False]) -def test_slice_and_squeeze(client, external_assets_folder, squeeze): - tw = 
TiledWriter(client) - - documents = render_templated_documents("external_assets_single_key.json", external_assets_folder) - for item in documents: - name, doc = item["name"], item["doc"] - if name == "start": - uid = doc["uid"] - - # Modify the documents to add slice and squeeze parameters - if name == "descriptor": - doc["data_keys"]["det-key2"]["shape"] = [1, 17] if squeeze else [1, 5, 17] - elif name in {"resource", "stream_resource"}: - doc["parameters"]["slice"] = ":,5,:" if squeeze else ":,:5,:" - doc["parameters"]["squeeze"] = squeeze - doc["parameters"]["chunk_shape"] = [1] - - tw(name, doc) - - # Try reading the imported data - assert client[uid]["primary"].read() is not None - - -def test_legacy_multiplier_parameter(client, external_assets_folder): - tw = TiledWriter(client) - - documents = render_templated_documents("external_assets_single_key.json", external_assets_folder) - for item in documents: - name, doc = item["name"], item["doc"] - if name == "start": - uid = doc["uid"] - - # Modify the documents to add slice and squeeze parameters - if name == "descriptor": - doc["data_keys"]["det-key2"]["shape"] = [13, 17] - elif name in {"resource", "stream_resource"}: - doc["parameters"]["multiplier"] = 1 - - tw(name, doc) - - # Try reading the imported data - assert client[uid]["primary"].read() is not None - - -def test_streams_with_no_events(client, external_assets_folder): - tw = TiledWriter(client) - - for item in render_templated_documents("external_assets_single_key.json", external_assets_folder): - name, doc = item["name"], item["doc"] - if name == "start": - uid = doc["uid"] - - # Skip the resource and datum documents - if name in {"resource", "stream_resource", "datum", "stream_datum", "event"}: - continue - - tw(name, doc) - - # Try reading the data -- should return an empty dataset - assert client[uid]["primary"].read() is not None - assert client[uid]["primary"].read().data_vars == {} - assert client[uid]["primary"].metadata is not None - - 
-@pytest.mark.parametrize("include_data_sources", [True, False]) -@pytest.mark.parametrize("fname", ["internal_events", "external_assets", "external_assets_legacy"]) -def test_zero_gets(client, external_assets_folder, fname, include_data_sources): - client = client.new_variation(include_data_sources=include_data_sources) - assert client._include_data_sources == include_data_sources - tw = TiledWriter(client) - assert bool(tw.client._include_data_sources) - - with record_history() as history: - for item in render_templated_documents(fname + ".json", external_assets_folder): - tw(**item) - - # Count the number of GET requests - num_gets = sum(1 for req in history.requests if req.method == "GET") - assert num_gets == 0 - - -def test_bad_document_order(client, external_assets_folder): - """Test that the TiledWriter can handle documents in a different order than expected - - Emit datum documents in the end, before the Stop document, but after corresponding Event documents. - """ - tw = TiledWriter(client) - - document_cache = [] - for item in render_templated_documents("external_assets_legacy.json", external_assets_folder): - name, doc = item["name"], item["doc"] - if name == "start": - uid = doc["uid"] - - if name == "datum": - document_cache.append({"name": name, "doc": doc}) - continue - - if name == "stop": - for cached_item in document_cache: - tw(**cached_item) - - tw(**item) - - run = client[uid] - - for stream in run.values(): - assert stream.read() is not None - assert "time" in stream.keys() - assert "seq_num" in stream.keys() - assert len(stream.keys()) > 2 # There's at least one data key in addition to time and seq_num - - -def test_json_backup(client, tmpdir, monkeypatch): - def patched_event(name, doc): - raise RuntimeError("This is a test error to check the backup functionality") - - monkeypatch.setattr("bluesky_tiled_plugins.writing.tiled_writer._RunWriter.event", patched_event) - - tw = TiledWriter(client, backup_directory=str(tmpdir)) - - for item in 
render_templated_documents("internal_events.json", ""): - name, doc = item["name"], item["doc"] - if name == "start": - uid = doc["uid"] - tw(**item) - - run = client[uid] - - assert "primary" in run # The Descriptor was processed and the primary stream was created - assert run["primary"].read() is not None # The stream can be read - assert len(run["primary"].read()) == 0 # No events were processed due to the error - assert "stop" in run.metadata # The TiledWriter did not crash - - # Check that the backup file was created - filepath = tmpdir / f"{uid[:8]}.jsonl" - assert filepath.exists() - with open(filepath) as f: - lines = [json.loads(line) for line in f if line.strip()] - assert len(lines) == 7 - assert lines[0]["name"] == "start" - assert lines[1]["name"] == "descriptor" - assert lines[2]["name"].startswith("event") - assert lines[6]["name"] == "stop" - - -@pytest.mark.parametrize( - "max_array_size, expected_scheme", [(0, "file"), (4, "file"), (16, "duckdb"), (-1, "duckdb")] -) -def test_internal_arrays_written_as_zarr(client, max_array_size, expected_scheme): - tw = TiledWriter(client, max_array_size=max_array_size) - - for item in render_templated_documents("internal_events.json", ""): - name, doc = item["name"], item["doc"] - if name == "start": - uid = doc["uid"] - tw(**item) - - run = client[uid] - - assert run["primary"]["long"].shape == (3, 8) - assert run["primary"]["long"].read() is not None - - # There's a table and it is stored in the SQL database - internal_table = run["primary"].base["internal"] - assert internal_table.read() is not None - assert urlparse(internal_table.data_sources()[0].assets[0].data_uri).scheme == "duckdb" - - if expected_scheme == "file": - assert "long" in run["primary"].base # There's a separate node for the array data - assert "long" not in internal_table.columns # The internal table does not have a column for it - assert urlparse(run["primary"]["long"].data_sources()[0].assets[0].data_uri).scheme == "file" - else: - 
assert "long" not in run["primary"].base - assert "long" in internal_table.columns - assert run["primary"]["long"].data_sources() is None diff --git a/bluesky-tiled-plugins/tests/test_utils.py b/bluesky-tiled-plugins/tests/test_utils.py deleted file mode 100644 index 4d8d5f5e9..000000000 --- a/bluesky-tiled-plugins/tests/test_utils.py +++ /dev/null @@ -1,47 +0,0 @@ -import orjson -from bluesky_tiled_plugins.utils import truncate_json_overflow - - -def test_truncate_json_overflow(): - # Test with a large integer - data = {"large_pos_int": 2**60, "large_neg_int": -(2**60)} - truncated_data = truncate_json_overflow(data) - assert orjson.dumps(truncated_data, option=orjson.OPT_STRICT_INTEGER) - for val in orjson.loads(orjson.dumps(truncated_data, option=orjson.OPT_STRICT_INTEGER)).values(): - assert val is not None - - # Test with a large float - data = {"large_pos_float": 2e308, "large_neg_float": -2e308} - truncated_data = truncate_json_overflow(data) - assert orjson.dumps(truncated_data, option=orjson.OPT_STRICT_INTEGER) - for val in orjson.loads(orjson.dumps(truncated_data, option=orjson.OPT_STRICT_INTEGER)).values(): - assert val is not None - - # Test with a list of large integers and floats - data = [[2**60, -(2**60)], [2e308, -2e308]] - truncated_data = truncate_json_overflow(data) - assert orjson.dumps(truncated_data, option=orjson.OPT_STRICT_INTEGER) - - # Test with a dictionary containing various types - data = { - "int": 42, - "float": 3.14, - "str": "Hello, world!", - "list": [1, 2, 3], - "dict": {"key": "value"}, - "large_int": 2**60, - "large_float": 2e308, - "nested": { - "large_neg_int": -(2**60), - "large_neg_float": -2e308, - "list_of_large_ints": [2**60, -(2**60)], - "list_of_large_floats": [2e308, -2e308], - }, - } - truncated_data = truncate_json_overflow(data) - assert orjson.dumps(truncated_data, option=orjson.OPT_STRICT_INTEGER) - - # Test with a NaN value - data = {"nan": float("nan")} - truncated_data = truncate_json_overflow(data) - assert 
orjson.loads(orjson.dumps(truncated_data, option=orjson.OPT_STRICT_INTEGER))["nan"] is None diff --git a/continuous_integration/scripts/install.sh b/continuous_integration/scripts/install.sh index fce06a5c5..efec551fb 100644 --- a/continuous_integration/scripts/install.sh +++ b/continuous_integration/scripts/install.sh @@ -9,6 +9,5 @@ python -m pip install --upgrade pip setuptools wheel numpy # Versioneer uses the most recent git tag to generate __version__, which appears # in the published documentation. git fetch --tags -python -m pip install .[all] -python -m pip install ./bluesky-tiled-plugins +python -m pip install ".[all]" python -m pip list diff --git a/databroker/tutorial_utils.py b/databroker/tutorial_utils.py deleted file mode 100644 index 853816e4f..000000000 --- a/databroker/tutorial_utils.py +++ /dev/null @@ -1,95 +0,0 @@ -import io -import os -from pathlib import Path -from shutil import copyfileobj -import sys -import zipfile - -import appdirs -import databroker -from databroker_pack import unpack_inplace -import requests -from tqdm.auto import tqdm # automatically chooses tqdm.tqdm or tqdm.notebook -from tqdm.utils import CallbackIOWrapper - - -DEFAULT_DATA_DIR = Path(appdirs.user_data_dir(), "bluesky_tutorial_data") -data_dir = os.getenv("BLUESKY_TUTORIAL_DATA", DEFAULT_DATA_DIR) - - -def _extractall_with_progress_bar(source, dest): - "Unzip source into dest, updating a progress bar as we go." 
- # Derived from https://stackoverflow.com/a/65513860rchive.extractall(directory) - dest = Path(dest).expanduser() - with zipfile.ZipFile(source) as zipf, tqdm( - desc="Extracting", - unit="iB", - unit_scale=True, - unit_divisor=1024, - total=sum(getattr(i, "file_size", 0) for i in zipf.infolist()), - ) as pbar: - os.makedirs(dest, exist_ok=True) - for i in zipf.infolist(): - if not getattr(i, "file_size", 0): # directory - zipf.extract(i, os.fspath(dest)) - else: - with zipf.open(i) as fi, open(os.fspath(dest / i.filename), "wb") as fo: - copyfileobj(CallbackIOWrapper(pbar.update, fi), fo) - print(f"Extracted internal data files to {dest}", file=sys.stderr) - - -def _download_with_progress_bar(response, buffer): - "Stream the data from the response into the buffer, updating a progress bar as we go." - # Derived from https://stackoverflow.com/a/37573701 - response.raise_for_status() - total_size = int(response.headers.get("Content-Length", 0)) - block_size = 1024 - with tqdm( - total=total_size, unit="iB", unit_scale=True, desc="Downloading" - ) as progress_bar: - for chunk in response.iter_content(block_size): - progress_bar.update(len(chunk)) - buffer.write(chunk) - - -def _fetch_into_memory_and_unzip_to_disk(name, url): - if name in databroker.catalog: - return databroker.catalog[name] - buffer = io.BytesIO() - directory = Path(data_dir, name) - with requests.get(url, stream=True) as response: - _download_with_progress_bar(response, buffer) - _extractall_with_progress_bar(buffer, directory) - config_path = unpack_inplace(directory, name) - print( - f"Placed config file at {config_path} to add this catalog to databroker.catalog.\n", - "Access it at any time via\n\n" - " import databroker\n" - f" databroker.catalog['{name}'].", - file=sys.stderr, - ) - # If the config directory did not exist at import time when - # intake.catalog.default.load_combo_catalog() was run, it will never be - # checked again. We need to explicitly add it. 
- combo_catalog_path = databroker.catalog._catalogs[-1].path - for ext in ["*.yml", "*.yaml"]: - path = os.path.join(os.path.dirname(config_path), "*.yml") - if path not in combo_catalog_path: - combo_catalog_path.append(path) - databroker.catalog.force_reload() - - -def fetch_BMM_example(version=1): - if version != 1: - raise ValueError("Only version 1 is known.") - name = "bluesky-tutorial-BMM" - url = "https://nsls2datasamples.blob.core.windows.net/bluesky-tutorial-example-data/BMM-example-v1.zip" - return _fetch_into_memory_and_unzip_to_disk(name, url) - - -def fetch_RSOXS_example(version=1): - if version != 1: - raise ValueError("Only version 1 is known.") - name = "bluesky-tutorial-RSOXS" - url = "https://nsls2datasamples.blob.core.windows.net/bluesky-tutorial-example-data/RSOXS-example-v1.zip" - return _fetch_into_memory_and_unzip_to_disk(name, url) diff --git a/docs/source/conf.py b/docs/source/conf.py index 7a5cbdbaf..bd1a27edc 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -46,7 +46,6 @@ 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', 'sphinx.ext.autosummary', - 'sphinx.ext.extlinks', 'numpydoc', 'IPython.sphinxext.ipython_directive', 'IPython.sphinxext.ipython_console_highlighting', diff --git a/docs/source/explanations/local-and-remote-use-cases.rst b/docs/source/explanations/local-and-remote-use-cases.rst deleted file mode 100644 index 3e6cb4f58..000000000 --- a/docs/source/explanations/local-and-remote-use-cases.rst +++ /dev/null @@ -1,107 +0,0 @@ -How can it be used locally and remotely? -======================================== - -The bluesky ecosystem provides several modes for accessing data: - -* Access Central DataBroker via a Generic Remote Client --- This includes - Remote Desktop, Jupyter, and SSH. -* Portable DataBroker with Local Data --- Let users use ``databroker`` on their - laptops and/or on servers at their home institutions, with all the relevant - data copied locally and no need for a network connection. 
-* Portable DataBroker with Remote Data --- Let users use ``databroker`` on their - laptops and/or on servers at their home institutions, pulling data from an - HTTP server on demand, and optionally caching it locally. -* Traditional File Export --- Export data to files for existing software that - expects files in a certain format named a certain way. - - -Access Central DataBroker via a Generic Remote Client ------------------------------------------------------ - -In this mode, users do not install ``databroker`` locally. They use any remote -client---such as Remote Desktop, Jupyter, or SSH---to access a Python -environment on the source machine, and use ``databroker`` there, which -presumably has fast access to the data storage and some compute resources. - - -Portable DataBroker with Local Data ------------------------------------ - -DataBroker is not itself a data store; it is a Python library for accessing -data across a variety of data stores. Therefore, it can be run on a laptop -without network connectivity, accessing data stored in ordinary files or in -a local database. Both are officially supported. - -The process involves: - -#. Identify a subset of the data to be copied locally from the source - institution, given as a query (e.g. a time range) or a list of unique - identifiers. Export the documents into a file-based format (typically - msgpack). Copy any of the large "external" files (e.g. TIFF or HDF5 files - generated by large detectors). -#. Transfer all of this to the target machine, perhaps via ``rsync`` or Globus. - Place a configuration file discoverable by ``databroker`` that points to the - location where the files were transferred. -#. Install the Python library ``databroker`` on the target machine using pip or - conda. - -DataBroker can work on top of a directory of ordinary files just fine; it even -supports the same queries that it would normally run on a database---just less -efficiently. 
Optionally, ingest the documents into a local database to support -more efficient queries. - -The small utility -`databroker-pack `_ streamlines the -process of "packing" some data from data broker into portable files and -"unpacking" them at their destination. - -Portable DataBroker with Remote Data ------------------------------------- - -In this mode, data copying would happen invisibility to the user and only on -demand. The process involves: - -#. Install the Python library ``databroker`` on the target machine using pip or - conda. -#. Provide databroker with the URL of a remote "remote data catalog" running - that the source facility. - -The user experience from there is exactly the same where the data happens to be -local or remote. Thus, users could write code in one mode and seamless -transition to the other. - -Data is downloaded on demand, and it may be cached locally so that it need not -be repeatedly downloaded. This requires a stable URL and a reliable network -connection. There are *no instances of this mode* known at this time, but all -the software pieces to achieve it exist. It is on the project roadmap. - -Traditional File Export ------------------------ - -Export the data to files (e.g. TIFFs and/or CSVs) with the metadata of your -choice encoded in filenames. This mode forfeits much of the power of databroker -and the bluesky ecosystem generally, but it is important for supporting -existing workflows and software that expects files in a certain format named a -certain way. - -We expect this mode to become less useful as data sizes increase and scientific -software literacy grows over time. It is a bridge. - -Streaming Export -^^^^^^^^^^^^^^^^ - -This means exporting the data during data acquisition such that partial results -are available for reading. The bluesky -`suitcase `_ project provides a pattern -for doing this and ready-to-use implementations for popular formats. - -The streaming export tools may also be used after data acquisition. 
- -Prompt Export -^^^^^^^^^^^^^ - -This means exporting the data at the end of data acquisition. (To be precise, -at the end of each "Bluesky Run". The scope of a "Run" is up to the details of -the data acquisition procedure.) This is typically much simpler than streaming -export and can be implemented *ad hoc* by accessing the data from databroker -and writing out a file using the relevant Python I/O library. diff --git a/docs/source/explanations/relationship-to-intake.rst b/docs/source/explanations/relationship-to-intake.rst deleted file mode 100644 index 0c196d396..000000000 --- a/docs/source/explanations/relationship-to-intake.rst +++ /dev/null @@ -1,98 +0,0 @@ -.. currentmodule:: databroker.core - -What is Databroker's relationship to Intake? -============================================ - -Intake Concepts ---------------- - -Intake has a notion of Catalogs. Catalogs are roughly dict-like. Iterating over -a Catalog yields the names of its entries, which are strings. Iterating over -``catalog.items()`` yields ``(name, Entry)`` pairs. An Entry is roughly like -a ``functools.partial`` with metadata and intake-specific semantics. When an -Entry is opened, by calling it ``entry.get()`` or, equivalently and more -succinctly, ``entry()``, it returns its content. The content could be another -Catalog or a DataSource. - -Calling ``.read()`` on a DataSource returns some in-memory representation, such -as a numpy array, pandas DataFrame, or xarray.Dataset. Calling ``.to_dask()`` -return the "lazy" dask-backed equivalent structure. - -DataBroker Concepts -------------------- - -DataBroker represents a Bluesky "Event Stream", a logical table of data, as a -DataSource, :class:`BlueskyEventStream`. Calling -:meth:`BlueskyEventStream.read` returns an xarray Dataset backed by numpy -arrays; calling :meth:`BlueskyEventStream.to_dask` returns an xarray Dataset -backed by dask arrays. 
- -DataBroker represents a Bluesky Run, sometimes loosely referred to as a "scan", -as a Catalog of Event Streams, :class:`BlueskyRun`. For example, the entries in -a :class:`BlueskyRun` might have the names ``'primary'`` and ``'baseline'``. -The entries always contain instances of :class:`BlueskyEventStream`. -:class:`BlueskyRun` extends the standard Catalog interface with a special -method :meth:BlueskyRun.documents`. This returns a generator that yields -``(name, doc)`` pairs, recreating the stream of documents that would have been -emitted during data acquisition. (This is akin to ``Header.documents()`` in -DataBroker v0.x.) - -:class:`BlueskyEventStream` and :class:`BlueskyRun` should never be -instantiated by the user. They have complex signatures, and they are agnostic -to the storage mechanism; they could be backed by objects in memory, files, or -databases. - -Continuing to move up the hierarchy, we get to catalogs whose Entries contain -:class:`BlueskyRun` instances. Each entry's name is the corresponding RunStart -``uid``. The Catalogs at this level of the hierarchy include: - -.. currentmodule:: databroker - -* :class:`_drivers.jsonl.BlueskyJSONLCatalog` -* :class:`_drivers.msgpack.BlueskyMsgpackCatalog` -* :class:`_drivers.mongo_normalized.BlueskyMongoCatalog` -* :class:`_drivers.mongo_embedded.BlueskyMongoCatalog` - -Notice that these are located in an internal package, ``_drivers``. Except for -testing purposes, they should never be directly imported. They should be -accessed by their name from intake's driver registry as in: - -.. code:: python - - import intake - cls = intake.registry['bluesky-jsonl-catalog'] - -At some point in the future, once the internal APIs stabilize, these classes -and their specific dependencies (msgpack, pymongo, etc.) will be moved out of -databroker into separate packages. Avoid directly importing from ``_drivers`` -so that this change will not break your code. 
- -Scaling Intake Catalogs ------------------------ - -To make Catalogs scale to tens of thousands of entries, override the methods: - -* ``__iter__`` -* ``__getitem__`` -* ``__contains__`` -* ``__len__`` - -A simple intake Catalog populates an internal dictionary, ``Catalog._entries``, -mapping entry names to :class:`LocalCatalogEntry` objects. This approach does -not scale to catalogs with large number of entries, where merely populating the -keys of the ``Catalog._entries`` dict is expensive. To customize the type of -``_entries`` override :meth:`Catalog._make_entries_container` and return a -dict-*like* object. This object must support iteration (looping through part or -all of the catalog in order) and random access (requesting a specific entry by -name) by implementing ``__iter__`` and ``__getitem__`` respectively. - -It should also implement ``__contains__`` because, similarly, if -``__contains__`` is specifically implemented, Python will iterate through all the -entries and check each in turn. In this case, it is likely more efficient to -implement a ``__contains__`` method that uses ``__getitem__`` to determine -whether a given key is contained. - -Finally, the Catalog itself should implement ``__len__``. If it is not -implemented, intake may obtain a Catalog's length by iterating through it -entirely, which may be costly. If a more efficient approach is possible (e.g. a -COUNT query) it should be implemented. diff --git a/docs/source/explanations/v2-transition.rst b/docs/source/explanations/v2-transition.rst deleted file mode 100644 index 338dad718..000000000 --- a/docs/source/explanations/v2-transition.rst +++ /dev/null @@ -1,114 +0,0 @@ -.. _v2-transition: - -What are the API versions v0, v1, v2? -===================================== - -The Databroker codebase currently contains two complete and mostly independent -implementations. One is the original implementation from 2015. 
The other is a -rewrite performed in 2019--2020, written to leverage the scientific Python -libraries `dask`_, `intake`_, and `xarray`_, which emerged or reached maturity -some time after 2015. Databroker 1.x provides three public interfaces on top of -these two implementations. - -=========== ========= ============== ================================================== -API version Interface Implementation Who should use it? -=========== ========= ============== ================================================== -v2 New New All new users -v1 Original New Users with old scripts that use original interface -v0 Original Original Users who hit bugs in v1/v2 and need a fallback -=========== ========= ============== ================================================== - -Which interface should I use? ------------------------------ - -If you are a new user, use v2. That is the version covered by the tutorials and -user guides. - -As far as we know the only heavy users of the "original" 2015 interface are at -NSLS-II. If you are such a user and you have have existing scripts using that -original interface, know that we committed to supporting it for many years to -come. We do not want to break your scripts. Consider using v2 for *new* work, -however, to enjoy its improved usability and feature set. - -Do they use the same storage? ------------------------------ - -Both implemenations integrate with `external assets`_ (e.g. large arrays from -imaging detectors) in exactly the same way. - -Both use the same MongoDB storage layout for Bluesky documents. You can access -the same MongoDB database from v0, v1, and v2, moving between them seamlessly. - -However, the original (v0) implementation also supported sqlite for very -lightweight use cases and had an experimental HDF5-based storage. Both of these -are deprecated. 
Instead, the new implementation (v2 / v1) adds support for -msgpack- and JSONL-backed storage, which have proven to be a better solution -for very lightweight use cases. (See :ref:`migration_from_v0_storage`.) More -are storage options are planned for early 2021, with an emphasis on efficient -binary formats, such as `TileDB`_. - -How do I use them? ------------------- - -All of the tutorials now use ``databroker.v2``. As they show, this gets you a -v2-style catalog. - -.. code:: python - - # v2, recommended for new users - import databroker - catalog = databroker.catalog["MY CATALOG NAME"] - -This older-style usage gets you a v1-style catalog. - -.. code:: python - - # v1, supported for existing users supporting old code - from databroker import Broker - db = Broker.named("MY CATALOG NAME") - -Since there are just different interfaces built on the same underlying (new) -implementation, we can easily to move between them. - -.. code:: python - - catalog # v2 - catalog.v1 # v1 - catalog.v2 # v2 (just returns a reference to itself) - db # v1 - db.v1 # v1 (just returns a reference to itself) - db.v2 # v2 - -Therefore, code written like - -.. code:: python - - def f(catalog_or_db): - catalog = catalog_or_db.v2 # ensure we have a v2 interface - ... - -will work on both v1-style and v2-style. - -Finally, the v0 implementation is available as the battle-tested emergency -fallback in case of any show-stopping bugs the newer implementation underlying -v1 and v2. You *cannot* move between v0 and other interfaces. You can invoke v0 -like so: - -.. code:: python - - # v0, emergency fallback if v1/v2 is broken - from databroker.v0 import Broker - db = Broker.named("MY CATALOG NAME") - -In the future, we will remove v0 from the codebase; v1 will be sufficient to -support old user code. - -.. _intake: https://intake.readthedocs.io - -.. _xarray: https://xarray.pydata.org/ - -.. _dask: https://dask.org/ - -.. _TileDB: https://tiledb.com/ - -.. 
_external assets: https://blueskyproject.io/event-model/external.html diff --git a/docs/source/how-to/download-data-samples.rst b/docs/source/how-to/download-data-samples.rst deleted file mode 100644 index 4fcb7417b..000000000 --- a/docs/source/how-to/download-data-samples.rst +++ /dev/null @@ -1,26 +0,0 @@ -How to download some Catalogs with data samples -=============================================== - -*I want to download some sample datasets and access them with Databroker.* - -As shown at the top of every tutorial, there are some curated datasets -available via this one-liner. This are open `CC0`_-licensed data contributed -by Bluesky users. - -.. code:: python - - import databroker.tutorial_utils - - # Catalog of many small Runs - catalog = databroker.tutorial_utils.fetch_BMM_example() - - # Catalog with one large Run (1 GB uncompressed) - catalog = databroker.tutorial_utils.fetch_RSOXS_example() - -These are the first two of many to come. We are actively curating more. -If you have some to offer, please open an issue on the -`bluesky/data-samples issue tracker`_. - -.. _CC0: https://creativecommons.org/share-your-work/public-domain/cc0/ - -.. _bluesky/data-samples issue tracker: https://github.com/bluesky/data-samples/issues/new diff --git a/docs/source/how-to/file-backed-catalog.rst b/docs/source/how-to/file-backed-catalog.rst deleted file mode 100644 index 94a3ea883..000000000 --- a/docs/source/how-to/file-backed-catalog.rst +++ /dev/null @@ -1,103 +0,0 @@ -How to create a new Catalog backed by files -=========================================== - -*I want to quickly set up a small Databroker Catalog.* - -Why backed by files and not a database? ---------------------------------------- - -Pro: You don't have to run a database to try it. - -Cons: It only scales to about 100 Runs, it will be a bit slower, and you lose -*some* of the search capability, such as full text search. 
- -Databroker works best when backed by a proper database, but for tiny -deployments of up to about 100 Runs, Databroker can run on top of ordinary -files on disk. This can be convenient in these situations: - -* Just trying things out -* Testing -* Tutorials and demos -* Sending users home with a "portable databroker" without setting up a - database, as long as they don't have a large number of Runs - -Temporary Catalog ------------------ - -If you are in the "just trying things" phase, you might start by creating a -*temporary* Catalog backed by file in your system's temp directory. It will -be hard to find again after you exit Python, and it will be permanently deleted -whenever you system next reboots, so do not put anything important (or -especially large) there. - -.. code:: python - - from databroker import temp - catalog = temp() - # That's it! - -See :doc:`store-data-from-run-engine` or -:doc:`store-analysis-results` to put some actual data in there, and see -the tutorials for how to get it back out. - -Persistent Catalog ------------------- - -Taking the next step, let's make a persistent Catalog. - -#. Find where Databroker looks for Catalog configuration files on your system. - It varies by OS and environment because Databroker does its best to be a - polite guest and place configuration files where the local conventions - dictate. Run this snippet to find the list of paths where it looks - on your system. - - .. code:: bash - - python3 -c "import databroker; print(databroker.catalog_search_path())" - -#. Compose a configuration file like this. The filename of the configuration - file is unimportant, but using ``CATALOG_NAME.yml`` is conventional. The - file should be placed in any one of the directories listed by the previous - step. - - .. 
code:: yaml - - sources: - CATALOG_NAME: - driver: "bluesky-msgpack-catalog" - args: - paths: - - "DESTINATION_DIRECTORY/*.msgpack" - - where ``CATALOG_NAME`` is a name of the entry that will appear in - ``databroker.catalog``, and ``DESTINATION_DIRECTORY`` is where the data - will be stored. - - Note that the value of ``paths`` is a list. Multiple data directories may be - grouped into one "source". - -#. Now ``CATALOG_NAME`` should appear in - - .. code:: python - - import databroker - - # List catalog names. - list(databroker.catalog) - - If it does not appear, call ``databroker.catalog.force_reload()`` and retry. - The catalog may be accessed like - - .. code:: python - - import databroker - - # List catalog names. - list(databroker.catalog) - - using the ``CATALOG_NAME`` in the text of the configuration file. (Again, - the *filename* of the configuration file is not relevant.) - -See :doc:`store-data-from-run-engine` or -:doc:`store-analysis-results` to put some actual data in there, and see -the tutorials for how to get it back out. diff --git a/docs/source/how-to/mongo-backed-catalog.rst b/docs/source/how-to/mongo-backed-catalog.rst deleted file mode 100644 index 352f56331..000000000 --- a/docs/source/how-to/mongo-backed-catalog.rst +++ /dev/null @@ -1,120 +0,0 @@ -How to create a new Catalog backed by MongoDB -============================================= - -*I want to set up a performant Catalog that scales to large numbers of Runs and -supports the full search capability of Databroker.* - -#. Install the `MongoDB Community Edition`_. We recommend the latest stable - version. Any version 3.x or later should be fine. Alternatively, you can - run MongoDB in the `MongoDB Docker container`_ maintained by Docker. See - :ref:`container_advice` below if you go this route. - -#. Find where Databroker looks for Catalog configuration files on your system. 
- It varies by OS and environment because Databroker does its best to be a - polite guest and place configuration files where the local conventions - dictate. Run this snippet to find the list of paths where it looks - on your system. - - .. code:: bash - - python3 -c "import databroker; print(databroker.catalog_search_path())" - -#. Compose a configuration file like this. The filename of the configuration - file is unimportant, but using ``CATALOG_NAME.yml`` is conventional. The - file should be placed in any one of the directories listed by the previous - step. - - .. code:: yaml - - sources: - CATALOG_NAME: - driver: bluesky-mongo-normalized-catalog - args: - metadatastore_db: mongodb://HOST:PORT/DATABASE_NAME - asset_registry_db: mongodb://HOST:PORT/DATABASE_NAME - - where ``CATALOG_NAME`` is a name of the entry that will appear in - ``databroker.catalog``. The two database URIs, ``metadatastore_db`` and - ``asset_registry_db``, are distinct only for historical reasons. For new - deployments, we recommend that you set them to the same value---i.e. that - you use one database shared by both. - - If you are using Databroker on the same system where you are running - MongoDB, then the URI would be ``mongodb://localhost:27017/DATABASE_NAME`` - where ``DATABASE_NAME`` is fully up to you. - -#. Now ``CATALOG_NAME`` should appear in - - .. code:: python - - import databroker - - # List catalog names. - list(databroker.catalog) - - If it does not appear, call ``databroker.catalog.force_reload()`` and retry. - The catalog may be accessed like - - .. code:: python - - catalog = databroker.catalog[CATALOG_NAME] - - using the ``CATALOG_NAME`` in the text of the configuration file. (Again, - the *filename* of the configuration file is not relevant.) - -See :doc:`store-data-from-run-engine` or -:doc:`store-analysis-results` to put some actual data in there, and see -the tutorials for how to get it back out. 
- -Security --------- - -Databroker was designed with access controls *per Run* in mind, and this is now -being actively developed, but currently only all-or-nothing access is -supported: Users can access all the Runs in the MongoDB or none of them. - -#. `Enable authentication on MongoDB`_. Following those instructions, create a - user with read and write access to your database and set a secure password. - -#. Edit your configuration file as to add a template for username and password - in the URI as follows. Notice the addition of the query parameter - ``authSource=admin`` as well. - - .. code:: yaml - - metadatastore_db: mongodb://{{ env(DATABROKER_MONGO_USER) }}:{{ env(DATABROKER_MONGO_PASSWORD) }}@HOST:PORT/DATABASE_NAME?authSource=admin - asset_registry_db: mongodb://{{ env(DATABROKER_MONGO_USER) }}:{{ env(DATABROKER_MONGO_PASSWORD) }}@HOST:PORT/DATABASE_NAME?authSource=admin - - - Refer to `PyMongo authentication documentation`_ for context. - -#. Set these environment variables to provide access to the database. - - .. code:: bash - - export DATABROKER_MONGO_USER='...' - export DATABROKER_MONGO_PASSWORD='...' - -.. _container_advice: - -Container Advice ----------------- - -If you choose to run MongoDB in a Docker container: - -* Be sure to mount persistent storage from the host machine into the volumes - MongoDB stores it data. When the container stops, you presumably still want - your data! -* See `this resource`_ for information on enabling authentication. - -.. _MongoDB Community Edition: https://docs.mongodb.com/manual/administration/install-community/ - -.. _MongoDB Docker container: https://hub.docker.com/_/mongo - -.. _Enable authentication on MongoDB: https://docs.mongodb.com/manual/tutorial/enable-authentication/ - -.. _PyMongo authentication documentation: https://pymongo.readthedocs.io/en/stable/examples/authentication.html#default-database-and-authsource - -.. _container: https://hub.docker.com/_/mongo - -.. 
_this resource: https://stackoverflow.com/a/42973849 diff --git a/docs/source/how-to/store-analysis-results.rst b/docs/source/how-to/store-analysis-results.rst deleted file mode 100644 index afb92db4a..000000000 --- a/docs/source/how-to/store-analysis-results.rst +++ /dev/null @@ -1,80 +0,0 @@ -How to store analysis results -============================= - -*I want to access analysis results using Databroker.* - -.. important:: - - This is very new work and should be treated as experimental. - -Databroker is designed to hold processed and analyzed data as well as raw data. -Currently, it is most commonly used for the former, but it was designed for -both from the start. - -When collecting raw data, we can rely on Ophyd and the RunEngine to organize -our data and metadata in a structure recognizable to Databroker. When capturing -analysis results, we have to do some of that works ourselves. - -#. Install ``bluesky-live``. If you have a recent version of databroker - (v1.2.0 or higher) then you already have it; it's a requirement of - databroker's. - - .. code:: bash - - pip install bluesky-live - -#. Organize your data into a BlueskyRun. If your data can be represented as - single table (i.e. a spreadsheet) with dictionary of metadata, then you can - use this simplified interace. - - .. code:: python - - from bluesky_live.run_builder import build_simple_run - - run = buiLd_simple_run({'x': [1, 2, 3], 'y': [4, 5, 6]}, metadata={'sample': 'Cu'}) - - Here, our "table" is given as a dict of lists, but the following are also accepted: - - * dict of numpy arrays - * pandas DataFrame - * xarray Dataset - - This is approach is equivalent: - - .. code:: python - - from bluesky_live.run_builder import RunBuilder - - with RunBuilder(metadata={'sample': 'Cu'}) as builder: - builder.add_stream("primary", data={'x': [1, 2, 3], 'y': [10, 20, 30]}) - run = builder.get_run() - - and, unlike ``build_simple_run``, it extends to multiple streams (i.e. 
- tables or spreadsheets), as in - - .. code:: python - - with RunBuilder(metadata={'sample': 'Cu'}) as builder: - builder.add_stream("primary", data={'x': [1, 2, 3], 'y': [10, 20, 30]}) - builder.add_stream("baseline", data={'A': [-1, -1], 'B': [250, 250]}) - run = builder.get_run() - -#. Store your BlueskyRun. - - .. code:: python - - for name, doc in run.documents(): - catalog.v1.insert(name, doc) - -.. note:: - - You may notice that we are falling back to the ``v1`` API here, where for - all other things we show and recommend the new ``v2`` API. This is because - we are still `discussing the design`_ for this in v2. Until that is sorted - out, this is the officially-recommended solution. - - It uses `Suitcase`_ internally to do the writing. - -.. _discussing the design: :issue:`605` - -.. _Suitcase: https://blueskyproject.io/suitcase/ diff --git a/docs/source/how-to/store-data-from-run-engine.rst b/docs/source/how-to/store-data-from-run-engine.rst deleted file mode 100644 index 1577cbda9..000000000 --- a/docs/source/how-to/store-data-from-run-engine.rst +++ /dev/null @@ -1,52 +0,0 @@ -How to store data from the Run Engine -===================================== - -*I want to connect RunEngine with Databroker and start saving data.* - -For small- and medium-sized deployments ---------------------------------------- - -Subscribe the Run Engine directly to Databroker. - -.. code:: python - - RE.subscribe(catalog.v1.insert) - -.. note:: - - You may notice that we are falling back to the ``v1`` API here, where for - all other things we show and recommend the new ``v2`` API. This is because - we are still `discussing the design`_ for this in v2. Until that is sorted - out, this is the officially-recommended solution. - - It uses `Suitcase`_ internally to do the writing. - -This will cause the RunEngine to wait for each document it emits to be stored -successfully before it proceeds with the next step of data acquisition. 
- -Pro: We are assured that if data is not saved successfully, we will immediately -know and the data acquisition will be aborted. We avoid the scary scenario of -thinking we are saving data when we are not. - -Con: By waiting for data to make it all the way into the database, data -acquisition will be marginally slower than if we took a more sophisticated -approach. - -For facility-scale deployments ------------------------------- - -At present, all facilities currently using Bluesky (as far as we are aware) are -using this straightforward approach described above but a more sophisticated -alternative is being tested. - -In short, put a message bus such as Kafka between the Run Engine and the -database. Tooling for this is under development at -`bluesky-kafka`_. Check back here for updates later in 2021. - -.. _discussing the design: :issue:`605` - -.. _Suitcase: https://blueskyproject.io/suitcase/ - -.. _Kafka: https://kafka.apache.org/ - -.. _bluesky-kafka: https://github.com/bluesky/bluesky-kafka diff --git a/docs/source/index.rst b/docs/source/index.rst index be69b097d..d831eed31 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,76 +1,10 @@ .. include:: ../../README.rst -How the documentation is structured ------------------------------------ - -.. rst-class:: columns - -:ref:`tutorials` -~~~~~~~~~~~~~~~~ - -Tutorials for installation and usage. New users start here. - -.. rst-class:: columns - -:ref:`how-to` -~~~~~~~~~~~~~ - -Practical step-by-step guides for the more experienced user. - -.. rst-class:: columns - -:ref:`explanations` -~~~~~~~~~~~~~~~~~~~ - -Explanation of how the library works and why it works that way. - -.. rst-class:: columns - -:ref:`reference` -~~~~~~~~~~~~~~~~ - -Technically detailed API documenation. - -.. rst-class:: endcolumns - About the documentation ~~~~~~~~~~~~~~~~~~~~~~~ `Why is the documentation structured this way? `_ -.. 
toctree:: - :caption: Tutorials - :name: tutorials - :maxdepth: 1 - - tutorials/install - tutorials/search-and-lookup - tutorials/get-data - tutorials/get-metadata - tutorials/export - -.. toctree:: - :caption: How-to Guides - :name: how-to - :maxdepth: 1 - - how-to/download-data-samples - how-to/file-backed-catalog - how-to/store-data-from-run-engine - how-to/store-analysis-results - how-to/mongo-backed-catalog - -.. toctree:: - :caption: Explanations - :name: explanations - :maxdepth: 1 - - explanations/v2-transition - explanations/local-and-remote-use-cases - explanations/relationship-to-intake - -.. rst-class:: no-margin-after-ul - .. toctree:: :caption: Reference :name: reference diff --git a/docs/source/reference/configuration.rst b/docs/source/reference/configuration.rst index 81fd793b0..b1dbd4360 100644 --- a/docs/source/reference/configuration.rst +++ b/docs/source/reference/configuration.rst @@ -55,8 +55,7 @@ All Databroker "drivers" accept the following arguments: * ``handler_registry`` --- If ommitted or ``None``, the result of - :func:`~databroker.core.discover_handlers` is used. See - :doc:`event-model:external` for background on the role of "handlers". + :func:`~databroker.core.discover_handlers` is used. * ``root_map`` --- This is passed to :func:`event_model.Filler` to account for temporarily moved/copied/remounted files. Any resources which have a ``root`` matching a @@ -212,8 +211,7 @@ Legacy (v0-style) configuration For backward-compatibility, configuration files specifying MongoDB storage are discovered and included in ``databroker.catalog``. Other legacy formats -(SQLite, HDF5) are only accessible via v0. See -:ref:`v2-transition`. +(SQLite, HDF5) are only accessible via v0. Search path ^^^^^^^^^^^ diff --git a/docs/source/reference/v2.rst b/docs/source/reference/v2.rst index c13971c90..af97b94b1 100644 --- a/docs/source/reference/v2.rst +++ b/docs/source/reference/v2.rst @@ -33,25 +33,3 @@ Utils .. autofunction:: databroker.v2.temp .. 
autofunction:: databroker.v1.temp - -Backend-Specific Catalogs -------------------------- - -.. note:: - - These drivers are currently being developed in databroker itself, but - will eventually be split out into separate repositories to isolate - dependencies and release cycles. This will be done once the internal - interfaces are stable. - -.. autoclass:: databroker._drivers.jsonl.BlueskyJSONLCatalog - :members: - -.. autoclass:: databroker._drivers.mongo_embedded.BlueskyMongoCatalog - :members: - -.. autoclass:: databroker._drivers.mongo_normalized.BlueskyMongoCatalog - :members: - -.. autoclass:: databroker._drivers.msgpack.BlueskyMsgpackCatalog - :members: diff --git a/docs/source/tutorials/export.rst b/docs/source/tutorials/export.rst deleted file mode 100644 index 19162dee1..000000000 --- a/docs/source/tutorials/export.rst +++ /dev/null @@ -1,192 +0,0 @@ -Export Data -=========== - -In this tutorial we will export data from a Run to files. We will do this in -two ways: - -* The simple way, using methods like ``to_csv`` provided by standard scientific - Python tools -* The "streaming" way, using Bluesky's Suitcases - -Set up for Tutorial -------------------- - -Before you begin, install ``databroker`` and ``databroker-pack``, following the -:doc:`install`. - -Start your favorite interactive Python environment, such as ``ipython`` or -``jupyter lab``. - -For this tutorial, we'll use a catalog of publicly available, openly licensed -sample data. Specifically, it is high-quality transmission XAS data from all -over the periodical table. - -This utility downloads it and makes it discoverable to Databroker. - -.. ipython:: python - - import databroker.tutorial_utils - databroker.tutorial_utils.fetch_BMM_example() - -Access the catalog as assign it to a variable for convenience. - -.. ipython:: python - - import databroker - catalog = databroker.catalog['bluesky-tutorial-BMM'] - -Let's take a Run from this Catalog. - -.. 
ipython:: python - - run = catalog[23463] - -What's in the Run? ------------------- - -The Run's "pretty display", shown by IPython and Jupyter and some other -similar tools, shows us a summary. - -.. ipython:: python - - run - -Each run contains logical "tables" of data called *streams*. We can see them in -the summary above, and we iterate over them programmatically with a ``for`` -loop or with ``list``. - -.. ipython:: python - - list(run) - -Simple Export -------------- - -Export to CSV or Excel -^^^^^^^^^^^^^^^^^^^^^^ - -CSV can be suitable small amounts of scalar data. It's not fast and it's not -particularly good way to store numeric data or rich metadata---but it is -universally understood and human-readable. - -Here, we look at the columns in the primary stream and choose some to export to -CSV. - -.. ipython:: python - - ds = run.primary.read() - ds - columns = ["I0", "It", "Ir", "dcm_energy"] # columns we want to export - df = ds[columns].to_dataframe() - df - # Setting index=False omits the "time" index on the left from the output. - df.to_csv("data.csv", index=False) - -If you target is to get data into Excel, note that you can write Excel files -directly. This requires an additional dependency that you may not already have -installed. - -.. code:: python - - # Install Excel writer used by pandas using pip... - pip install openpyxl - # or conda... - conda install -c conda-forge openpyxl - -.. ipython:: python - - df.to_excel("data.xlsx", index=False) - -Both of these methods have a large number of options to customize the output. -Use ``df.to_csv?`` (IPython, Jupyter) or ``help(df.to_csv)`` to learn more. -Likesie for ``df.to_excel``. - -If you have many runs to do in batch, you may use the metadata to automatically -generate filenames. It is strongly recommended to include part of the globally -unique id, ``uid``, at the end to ensure that names do not clash and overwrite. - -.. 
ipython:: python - - columns = ["I0", "It", "Ir", "dcm_energy"] - results = catalog.search({"XDI.Element.symbol": "Mn"}) - for uid, run in results.items(): - ds = run.primary.read() - df = ds[columns].to_dataframe() - # Generate filename from metadata. - md = run.metadata["start"] - filename = f'Mn-spectra-{md["scan_id"]}-{md["uid"]:.8}.csv' - df.to_csv(filename, index=False) - print(f"Saved {filename}") - -Export to HDF5 -^^^^^^^^^^^^^^ - -HDF5 is suitable for image data. It is understood by most data analysis -software. - -.. note:: - - This example uses h5py. - - .. code:: - - conda install h5py - - # or... - - pip install h5py - -.. ipython:: python - - import h5py - - ds = run.primary.read() - columns = ["I0", "It", "Ir", "dcm_energy"] # columns we want to export - with h5py.File("data.h5", "w") as file: - for column in columns: - file[column] = df[column] - -Streaming Export ----------------- - -A tool built for streaming export can be used on both saved data (as we'll do -here) and on live-streaming data during data acquisition. - -.. note:: - - This example uses suitcase-csv. - - .. code:: - - conda install -c nsls2forge suitcase-csv - - # or... - - pip install suitcase-csv - -.. ipython:: python - :okexcept: - - import suitcase.csv - artifacts = suitcase.csv.export(run.documents(fill="yes"), "output_directory") - artifacts - -Note that this operates on the entire `run` and all of its streams. When a Run -contains multiple streams, multiple CSV files will be created. This is why it -acceps a path to a *directory* rather than a path to a single file. Any data -that does well-suited to the format (e.g. image data in this case) is omitted -for the export. - -See `Suitcase`_ for a list of supported formats and more information. - -.. _Suitcase: https://blueskyproject.io/suitcase - -.. 
ipython:: python - :suppress: - - # Clean up - !rm data.csv - !rm -rf Mn-spectra* - !rm data.xlsx - !rm data.h5 - !rm -rf output_directory diff --git a/docs/source/tutorials/get-data.rst b/docs/source/tutorials/get-data.rst deleted file mode 100644 index 7f4313bb6..000000000 --- a/docs/source/tutorials/get-data.rst +++ /dev/null @@ -1,285 +0,0 @@ -Get Data from a Run -=================== - -In this tutorial we will: - -* Load all the data from a small Run and do some basic math and visualization. -* Load and visualize just a slice of data from a 1 GB dataset, without loading - the whole dataset. - -Set up for Tutorial -------------------- - -Before you begin, install ``databroker`` and ``databroker-pack``, following the -:doc:`install`. - -Start your favorite interactive Python environment, such as ``ipython`` or -``jupyter lab``. - -For this tutorial, we'll use a catalog of publicly available, openly licensed -sample data. Specifically, it is high-quality transmission XAS data from all -over the periodical table. - -This utility downloads it and makes it discoverable to Databroker. - -.. ipython:: python - - import databroker.tutorial_utils - databroker.tutorial_utils.fetch_BMM_example() - -Access the catalog as assign it to a variable for convenience. - -.. ipython:: python - - import databroker - catalog = databroker.catalog['bluesky-tutorial-BMM'] - -Let's take a Run from this Catalog. - -.. ipython:: python - - run = catalog[23463] - -What's in the Run? ------------------- - -The Run's "pretty display", shown by IPython and Jupyter and some other -similar tools, shows us a summary. - -.. ipython:: python - - run - -Each run contains logical "tables" of data called *streams*. We can see them in -the summary above, and we iterate over them programmatically with a ``for`` -loop or with ``list``. - -.. ipython:: python - - list(run) - -Get the data ------------- - -Access a stream by name. This returns an `xarray`_ Dataset. - -.. 
ipython:: python - - ds = run.primary.read() - ds - -Access columns, as in ``ds["I0"]``. This returns an `xarray`_ DataArray. - -.. ipython:: python - - ds["I0"].head() # Just show the first couple elements. - -Do math on columns. - -.. ipython:: python - - normed = ds["I0"] / ds["It"] - normed.head() # Just show the first couple elements. - -Visualize them. There are couple ways to do this. - -.. code:: python - - # The plot() method on xarray.DataArray - ds["I0"].plot() - -.. plot:: - - import databroker - - run = databroker.catalog['bluesky-tutorial-BMM'][23463] - ds = run.primary.read() - ds["I0"].plot() - -.. code:: python - - # The plot accessor on xarray.Dataset - ds.plot.scatter(x="dcm_energy", y="I0") - - -.. plot:: - - import databroker - - run = databroker.catalog['bluesky-tutorial-BMM'][23463] - ds = run.primary.read() - - # The plot accessor on xarray.Dataset - ds.plot.scatter(x="dcm_energy", y="I0") - - -.. code:: python - - # Using matplotlib directly - import matplotlib.pyplot as plt - import numpy - - plt.plot(ds["dcm_energy"], numpy.log(ds["It"] / ds["I0"])) - plt.xlabel("dcm_energy") - plt.ylabel("log(It / I0)") - -.. plot:: - - import databroker - import matplotlib.pyplot as plt - import numpy - - run = databroker.catalog['bluesky-tutorial-BMM'][23463] - ds = run.primary.read() - - plt.plot(ds["dcm_energy"], numpy.log(ds["It"] / ds["I0"])) - plt.xlabel("dcm_energy") - plt.ylabel("log(It / I0)") - -These `xarray`_ DataArray objects bundle a numpy (or numpy-like) array with -some additional metadata and coordinates. To access the underlying array -directly, use the ``data`` attribute. - -.. ipython:: python - - type(ds["I0"]) - type(ds["I0"].data) - -Looking again at this Run - -.. ipython:: python - - run - -we see it has a second stream, "baseline". Reading that, we notice that columns -it contains, its dimensions, and its coordinates are different from the ones in -"primary". That's why it's in a different stream. 
The "baseline" stream is a -conventional name for snapshots taken at the very beginning and end of a -procedure. We see a long list of instruments with two data points each---before -and after. - -.. ipython:: python - - run.baseline.read() - -Different Runs can have different streams, but "primary" and "baseline" are the -two most common. - -With that, we have accessed all the data from this run. - -Handle large data ------------------ - -The example data we have been using so far has no large arrays in it. For this -section we will download a second Catalog with one Run in it that contains -image data. It's 1 GB (uncompressed), which is large enough to exercise the -tools involved. These same techniques scale to much larger datasets. - -The large arrays require an extra reader, which we can get from the package -``area-detector-handlers`` using pip on conda. - -.. code:: bash - - pip install area-detector-handlers - # or... - conda install -c nsls2forge area-detector-handlers - -Scientificaly, this is Resonant Soft X-ray Scattering (RSoXS) data. (`Details`_.) - -.. ipython:: python - - import databroker.tutorial_utils - databroker.tutorial_utils.fetch_RSOXS_example() - -Access the new Catalog and assign this Run to a variable. - -.. ipython:: python - - import databroker - run = databroker.catalog['bluesky-tutorial-RSOXS']['777b44a'] - -In the previous example, we used ``run.primary.read()`` at this point. That -method reads all the data from the "primary" stream from storage into memory. -This can be inconvenient if: - -1. The data is so large it does not all fit into memory (RAM) at once. Reading - it would prompt a ``MemoryError`` (best case) or cause Python to crash - (worst case). -2. You only need a subset of the data for your analysis. Reading all of it - would waste time. - -In these situations, we can summon up an `xarray`_ backed by *placeholders* -(`dask`_ arrays). 
These act like normal numpy arrays in many respects, but -internally they divide the data up intelligently into chunks. They only load -the each chunk if and when it is actually needed for a computation. - -.. ipython:: python - - lazy_ds = run.primary.to_dask() - -Comparing ``lazy_ds["Synced_waxs_image"].data`` to ``ds["I0"].data`` from the -previous section, we see that the "lazy" variant contains ```` -and the original contains ordinary numpy ``array``. - -.. ipython:: python - - ds["I0"].head().data # array - lazy_ds["Synced_waxs_image"].data # dask.array, a placeholder - -As an example of what's possible, we can subtract from this image series the -mean of an image series taken while the shutter was closed ("dark" images). - -.. ipython:: python - - corrected = run.primary.to_dask()["Synced_waxs_image"] - run.dark.to_dask()["Synced_waxs_image"].mean("time") - corrected - middle_image = corrected[64, 0, :, :] # Pull out a 2D slice. - middle_image - -At this point, *no data has yet been read*. We are still working with -placeholders, building up an expression of work to be done in the future. -Finally, when we plot it or otherwise hand it off to code that will treat it as -normal array, the data will be loaded and processed (in chunks) and finally -give us a normal numpy array as a result. When only a sub-slice of the data is -actually used---as is the case in this example---only the relevant chunk(s) -will ever be loaded. This can save a lot of time and memory. - -.. code:: python - - import matplotlib.pyplot as plt - from matplotlib.colors import LogNorm - - # Plot a slice from the middle as an image with a log-scaled color transfer. - plt.imshow(middle_image, norm=LogNorm(), origin='lower') - -.. 
plot:: - - import databroker - import matplotlib.pyplot as plt - from matplotlib.colors import LogNorm - - run = databroker.catalog['bluesky-tutorial-RSOXS']['777b44a'] - corrected = run.primary.to_dask()["Synced_waxs_image"] - run.dark.to_dask()["Synced_waxs_image"].mean("time") - middle_image = corrected[64, 0, :, :] # Pull out a 2D slice. - plt.imshow(middle_image, norm=LogNorm(), origin='lower') - -We can force that processing to happen explicitly by calling ``.compute()``. - -.. ipython:: python - - middle_image.compute() - -Notice that we now see ``array`` in there instead of -``dask.array``. This is how we know that it's a normal array in memory, not a -placeholder for future work. - -For more, see the `xarray`_ documentation and the `dask`_ documentation. A good -entry point is the example covering `Dask Arrays`_. - -.. _xarray: https://xarray.pydata.org/ - -.. _dask: https://dask.org/ - -.. _Dask Arrays: https://examples.dask.org/array.html - -.. _Details: https://github.com/bluesky/data-samples/blob/master/catalogs/RSOXS/README.md diff --git a/docs/source/tutorials/get-metadata.rst b/docs/source/tutorials/get-metadata.rst deleted file mode 100644 index e585a960c..000000000 --- a/docs/source/tutorials/get-metadata.rst +++ /dev/null @@ -1,216 +0,0 @@ -Navigate Metadata in a Run -========================== - -In this tutorial we will access secondary measurements and metadata including: - -* Hardware configuration readings (e.g. exposure time) -* User-provided context like sample information -* Whether the Run completed with an error (and if so what error) -* Hardware-level timestamps for each measurement - -Set up for Tutorial -------------------- - -Before you begin, install ``databroker`` and ``databroker-pack``, following the -:doc:`install`. - -Start your favorite interactive Python environment, such as ``ipython`` or -``jupyter lab``. - -For this tutorial, we'll use a catalog of publicly available, openly licensed -sample data.
Specifically, it is high-quality transmission XAS data from all -over the periodic table. - -This utility downloads it and makes it discoverable to Databroker. - -.. ipython:: python - - import databroker.tutorial_utils - databroker.tutorial_utils.fetch_BMM_example() - -Access the catalog and assign it to a variable for convenience. - -.. ipython:: python - - import databroker - catalog = databroker.catalog['bluesky-tutorial-BMM'] - -Let's take a Run from this Catalog. - -.. ipython:: python - - run = catalog[23463] - -(Hardware) Configuration ------------------------- - -The Run may include configuration readings necessary for interpreting the -data. These are typically things that change slowly or not at all during the -Run, like detector exposure time, detector gain settings, or the configured -maximum motor velocity. - -First, let's look at the ``I0`` readings in the ``primary`` stream. What are -the configuration readings that might be necessary to interpret this data or -compare it with other data? - -.. ipython:: python - - da = run.primary.read()["I0"] - da.head() - -This section at the bottom of that summary - -.. code:: - - Attributes: - object: quadem1 - -is showing us that ``I0`` was measured by the device ``quadem1``. We can also -access that programmatically like - -.. ipython:: python - - da.attrs.get("object") - -We can then look up all the configuration readings associated with ``quadem1`` -in this stream. - -.. ipython:: python - - run.primary.config["quadem1"].read() - -If another Run ran the ``quadem1`` detector with a *different* integration -time, we could use this information to normalize the readings and compare them -accurately. - -TO DO: Get an example of that. - -Let's look at some other readings in the dataset. The ``It`` also comes from -``quadem1``, so those same configuration readings apply. - -..
ipython:: python - - ds["It"].attrs - -The ``dcm_energy`` readings, on the other hand, come from a different device, -which happens to also be named ``dcm_energy``. - -.. ipython:: python - - ds["dcm_energy"].attrs - -We can see that no configuration was recorded for that device. - -.. ipython:: python - - run.primary.config["dcm_energy"].read() - -How It Started --------------- - -There are many useful pieces of metadata that we know at the **start**, before -we begin acquiring data or running data processing/analysis. This includes what -we intend to do (i.e. which scan type or which data processing routine), who is -doing it, and any additional context like sample information. - -The only fields *guaranteed* by Databroker to be present are ``uid`` (a -globally unique identifier for the Run) and ``time`` (when it started) but -there is often a great deal more. - -.. code:: python - - >>> run.metadata["start"] - Start({ - 'XDI': {'Beamline': {'collimation': 'paraboloid mirror, 5 nm Rh on 30 nm Pt', - 'focusing': 'not in use', - 'harmonic_rejection': 'flat mirror, Pt stripe, pitch = ' - '7.0 mrad relative to beam', - 'name': 'BMM (06BM) -- Beamline for Materials ' - 'Measurement', - 'xray_source': 'NSLS-II three-pole wiggler'}, - 'Column': {}, - 'Detector': {'I0': '10 cm N2', 'Ir': '25 cm N2', 'It': '25 cm N2'}, - 'Element': {'edge': 'K', 'symbol': 'Ni'}, - 'Facility': {'GUP': 305832, - 'SAF': 305669, - 'current': '399.6', - 'cycle': '2020-1', - 'energy': '3.0', - 'mode': 'top-off', - 'name': 'NSLS-II'}, - 'Mono': {'angle_offset': 16.058109, - 'd_spacing': '3.1353241', - 'direction': 'forward', - 'encoder_resolution': 5e-06, - 'name': 'Si(111)', - 'scan_mode': 'fixed exit', - 'scan_type': 'step'}, - 'Sample': {'name': 'Ni', 'prep': 'Ni foil in ref'}, - 'Scan': {'edge_energy': 8332.800000000001, - 'experimenters': 'Neil Hyatt, Martin Stennett, Dan Austin, ' - 'Seb Lawson'}, - - ...
# snipped for brevity - } - -How It Ended ------------- - -There are other things we can only know at the **stop** (end) of an experiment, -including when and how it finished and how many events (rows) of data were -collected in each stream. - -.. ipython:: python - - run.metadata["stop"] - -We can use this to print the unique IDs of any experiments that failed - -.. ipython:: python - - for uid, run in catalog.items(): - if run.metadata["stop"]["exit_status"] != "success": - print(f"Run {uid} failed!") - -or, getting a bit fancier, to tally the number of failures. - -.. ipython:: python - - from collections import Counter - - counter = Counter() - for _, run in catalog.items(): - counter.update({run.metadata["stop"]["exit_status"]: 1}) - counter - -TO DO: Obtain an example catalog that has some failures in it so that this -example is not so trivial. - -Low-level Hardware Timestamps ------------------------------ - -.. note:: - - Any *precise* timing measurements should be in the data itself, not in this - supplemental hardware timestamp metadata. This should generally be - considered good for ~0.1 second precision alignment. - -Control systems provide us with individual timestamps for every reading. -These should generally *not* be used for data analysis. Any timing readings -necessary for analysis should be recorded as data, as a column in some stream. -These are intended to be used for debugging and troubleshooting. - -The timestamps associated with the readings in ``run.primary.read()`` are -available as - -.. ipython:: python - - run.primary.timestamps.read() - -Configuration readings also come with timestamps. The timestamps associated -with the configuration readings in ``run.primary.config["quadem1"].read()`` are -available as - -..
ipython:: python - - run.primary.config_timestamps["quadem1"].read() diff --git a/docs/source/tutorials/install.rst b/docs/source/tutorials/install.rst deleted file mode 100644 index afc6de994..000000000 --- a/docs/source/tutorials/install.rst +++ /dev/null @@ -1,67 +0,0 @@ -Installation Tutorial -===================== - -This tutorial covers - -* Installation using conda -* Installation using pip -* Installation from source - -Conda ------ - -We strongly recommend creating a fresh environment. - -.. code:: bash - - conda create -n try-databroker - conda activate try-databroker - -Install Databroker from the ``nsls2forge`` conda channel maintained by NSLS-II. - -.. code:: bash - - conda install -c nsls2forge databroker - -To follow along with the tutorials, you will also need -``databroker-pack``. - - -.. code:: bash - - conda install -c nsls2forge databroker-pack - -Pip ---- - -We strongly recommend creating a fresh environment. - -.. code:: bash - - python3 -m venv try-databroker - source try-databroker/bin/activate - -Install Databroker from PyPI. - -.. code:: bash - - python3 -m pip install databroker - -To follow along with the tutorials, you will also need -``databroker-pack``. - -.. code:: bash - - python3 -m pip install databroker-pack - - -Source ------- - -To install an editable installation for local development: - -.. code:: bash - - git clone https://github.com/bluesky/databroker - cd databroker - pip install -e . diff --git a/docs/source/tutorials/search-and-lookup.rst b/docs/source/tutorials/search-and-lookup.rst deleted file mode 100644 index 6c95d98a3..000000000 --- a/docs/source/tutorials/search-and-lookup.rst +++ /dev/null @@ -1,196 +0,0 @@ -.. currentmodule:: databroker - -Find Runs in a Catalog -====================== - -In this tutorial we will: - -* Look up a specific Run by some identifier. -* Look up a specific Run based on recency (e.g. "Show me the data I just took").
-* Search for Runs using both simple and complex search queries. - -Set up for Tutorial -------------------- - -Before you begin, install ``databroker`` and ``databroker-pack``, following the -:doc:`install`. - -Start your favorite interactive Python environment, such as ``ipython`` or -``jupyter lab``. - -For this tutorial, we'll use a catalog of publicly available, openly licensed -sample data. Specifically, it is high-quality transmission XAS data from all -over the periodic table. - -This utility downloads it and makes it discoverable to Databroker. - -.. ipython:: python - - import databroker.tutorial_utils - databroker.tutorial_utils.fetch_BMM_example() - -Access the catalog and assign it to a variable for convenience. - -.. ipython:: python - - import databroker - catalog = databroker.catalog['bluesky-tutorial-BMM'] - -Look-up -------- - -In this section we will look up a Run by its - -* Globally unique identifier --- unmemorable, but great for scripts -* Counting-number "scan ID" --- easier to remember, but not necessarily unique -* Recency --- e.g. "the data I just took" - -If you know exactly which Run you are looking for, the surest way to get it is -to look it up by its globally unique identifier, its "uid". This is the -recommended way to look up runs *in scripts* but it is not especially -fluid for interactive use. - -.. ipython:: python - - catalog['c07e765b-ce5c-4c75-a16e-06f66546c1d4'] - -The uid may be abbreviated. The first 7 or 8 characters are usually sufficient -to uniquely identify an entry. - -.. ipython:: python - - catalog['c07e765'] - -If the abbreviated uid is ambiguous---if it matches more than one Run---a -``ValueError`` is raised listing the matches. Try ``catalog['a']``, which will -match two Runs in this Catalog and raise that error. - -Runs typically also have a counting number identifier, dubbed ``scan_id``. This -is easier to remember.
Keep in mind that ``scan_id`` *is not necessarily unique*, -and Databroker will always give you the most recent match. -Some users are in the habit of resetting ``scan_id`` to 1 at the beginning of -a new experiment or operating cycle. This is why lookup based on the globally -unique identifier is safest for scripts and Jupyter notebooks, especially -long-lived ones. - -.. ipython:: python - - catalog[23463] - -Finally, it is often convenient to access data by recency, as in "the data that -I just took". - -.. ipython:: python - - catalog[-1] - -This syntax is meant to feel similar to accessing elements in a list or array -in Python, where ``a[-N]`` means "``N`` elements from the end of ``a``". - -In summary: - -================== =============================================== -``catalog["..."]`` Globally unique identifier ("uid") -``catalog[N]`` Counting number "scan ID" N (most recent match) -``catalog[-N]`` Nth most recent Run in the Catalog -================== =============================================== - -All of these always return *one* ``BlueskyRun`` or raise an exception. - -Search ------- - -Common search queries can be done with a high-level Python interface. - -.. ipython:: python - :okwarning: - - from databroker.queries import TimeRange - - results = catalog.search(TimeRange(since="2020-03-05")) - -The result of a search is just another Catalog. It has a subset of the original -Catalog's entries. We can compare the number of search results to the total -number of entries in ``catalog``. - -.. ipython:: python - - print(f"Results: {len(results)} Total: {len(catalog)}") - -We can iterate through the results for batch processing - -.. ipython:: python - - for uid, run in results.items(): - # Do something. - ... - -or access a particular result by using any of the lookup methods in the section -above, such as recency. This is a convenient way to quickly look at one search -result. - -..
ipython:: python - - results[-1] - -Because ``results`` is just another Catalog, we can search on the search -results to progressively narrow our results. - -.. ipython:: python - - narrowed_results = results.search({"num_points": {"$gt": 400}}) # Read on... - print(f"Narrowed Results: {len(narrowed_results)} Results: {len(results)} Total: {len(catalog)}") - -Custom queries can be done with the `MongoDB query language`_. -The simplest examples check for equality of a key and value, as in - -.. ipython:: python - - results = catalog.search({"XDI.Element.symbol": "Mn"}) - len(results) - -The above matches Runs where the 'start' document looks like:: - - { - ... - "XDI": {"Element": {"symbol": "Mn"}}, - ... - } - -The allowed keys are totally open-ended as far as Databroker is concerned. -This example is particular to the metadata recorded by the instrument that -it came from. What's useful in your case will depend on what metadata was -provided when the data was captured. Look at a couple Runs' start documents -to get a sense of the metadata that would be useful in searches. - -.. code:: python - - run = catalog[-1] - run.metadata["start"] - -Again, the syntax of a query is that of the `MongoDB query language`_. -It's an expressive language for specifying searches over heterogeneous -metadata. - -.. note:: - - When the data is stored by some means other than MongoDB, databroker uses - Python libraries that support most of MongoDB's query language without - actual MongoDB. - -Here is an example of a more sophisticated query, doing more than just checking -for equality. - -.. ipython:: python - - query = { - "XDI.Scan.edge_energy": {"$lte": 6539.0}, # less than or equal to - "XDI.Element.symbol": "Mn", - } - results = catalog.search(query) - len(results) - -See the MongoDB documentation linked above to learn other expressions like -``$lte``. - -.. _MongoDB query language: https://docs.mongodb.com/manual/reference/operator/query/