From 98f7d4d36cc33b5a96fe356e9d78d29eba9050e9 Mon Sep 17 00:00:00 2001
From: Ryan Abernathey <ryan.abernathey@gmail.com>
Date: Thu, 29 Apr 2021 18:41:09 -0400
Subject: [PATCH 01/21] started refactor; not working

---
 xhistogram/core.py | 111 ++++++++++++++++++++++++---------------------
 1 file changed, 59 insertions(+), 52 deletions(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index 819b7fb..8314fa5 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -5,7 +5,7 @@
 
 import numpy as np
 from functools import reduce
-from .duck_array_ops import (
+from numpy import (
     digitize,
     bincount,
     reshape,
@@ -146,6 +146,59 @@ def _histogram_2d_vectorized(
     return bin_counts
 
 
+def _actual_histogram(all_arrays, weights, axis, bins, density):
+
+    all_arrays_broadcast = broadcast_arrays(*all_arrays)
+    a0 = all_arrays_broadcast[0]
+    ndim = a0.ndim
+
+    do_full_array = (axis is None) or (set(axis) == set(range(a0.ndim)))
+    if do_full_array:
+        kept_axes_shape = None
+    else:
+        kept_axes_shape = tuple([a0.shape[i] for i in range(a0.ndim) if i not in axis])
+
+    def reshape_input(a):
+        if do_full_array:
+            d = a.ravel()[None, :]
+        else:
+            # reshape the array to 2D
+            # axis 0: preserved axis after histogram
+            # axis 1: calculate histogram along this axis
+            new_pos = tuple(range(-len(axis), 0))
+            c = np.moveaxis(a, axis, new_pos)
+            split_idx = c.ndim - len(axis)
+            dims_0 = c.shape[:split_idx]
+            assert dims_0 == kept_axes_shape
+            dims_1 = c.shape[split_idx:]
+            new_dim_0 = np.prod(dims_0)
+            new_dim_1 = np.prod(dims_1)
+            d = reshape(c, (new_dim_0, new_dim_1))
+        return d
+
+    all_arrays_reshaped = [reshape_input(a) for a in all_arrays_broadcast]
+
+    if weights is not None:
+        weights_reshaped = all_arrays_reshaped.pop()
+    else:
+        weights_reshaped = None
+
+    bin_counts = _histogram_2d_vectorized(
+        *all_arrays_reshaped,
+        bins=bins,
+        weights=weights_reshaped,
+        density=density
+    )
+
+    if bin_counts.shape[0] == 1:
+        assert do_full_array
+        bin_counts = bin_counts.squeeze()
+    else:
+        final_shape = kept_axes_shape + bin_counts.shape[1:]
+        bin_counts = reshape(bin_counts, final_shape)
+
+    return bin_counts
+
 def histogram(
     *args, bins=None, axis=None, weights=None, density=False, block_size="auto"
 ):
@@ -222,54 +275,15 @@ def histogram(
                 ax_positive = ndim + ax
             assert ax_positive < ndim, "axis must be less than ndim"
             axis_normed.append(ax_positive)
-        axis = np.atleast_1d(axis_normed)
-
-    do_full_array = (axis is None) or (set(axis) == set(range(a0.ndim)))
-    if do_full_array:
-        kept_axes_shape = None
-    else:
-        kept_axes_shape = tuple([a0.shape[i] for i in range(a0.ndim) if i not in axis])
+        axis = [int(i) for i in axis_normed]
 
-    all_args = list(args)
-    if weights is not None:
-        all_args += [weights]
-    all_args_broadcast = broadcast_arrays(*all_args)
+    all_arrays = list(args)
 
-    def reshape_input(a):
-        if do_full_array:
-            d = a.ravel()[None, :]
-        else:
-            # reshape the array to 2D
-            # axis 0: preserved axis after histogram
-            # axis 1: calculate histogram along this axis
-            new_pos = tuple(range(-len(axis), 0))
-            c = np.moveaxis(a, axis, new_pos)
-            split_idx = c.ndim - len(axis)
-            dims_0 = c.shape[:split_idx]
-            assert dims_0 == kept_axes_shape
-            dims_1 = c.shape[split_idx:]
-            new_dim_0 = np.prod(dims_0)
-            new_dim_1 = np.prod(dims_1)
-            d = reshape(c, (new_dim_0, new_dim_1))
-        return d
-
-    all_args_reshaped = [reshape_input(a) for a in all_args_broadcast]
-
-    if weights is not None:
-        weights_reshaped = all_args_reshaped.pop()
-    else:
-        weights_reshaped = None
-
-    n_inputs = len(all_args_reshaped)
+    n_inputs = len(all_arrays)
     bins = _ensure_bins_is_a_list_of_arrays(bins, n_inputs)
 
-    bin_counts = _histogram_2d_vectorized(
-        *all_args_reshaped,
-        bins=bins,
-        weights=weights_reshaped,
-        density=density,
-        block_size=block_size,
-    )
+    print(axis)
+    bin_counts = _actual_histogram(all_arrays, weights, axis, bins, density)
 
     if density:
         # Normalise by dividing by bin counts and areas such that all the
@@ -287,11 +301,4 @@ def reshape_input(a):
     else:
         h = bin_counts
 
-    if h.shape[0] == 1:
-        assert do_full_array
-        h = h.squeeze()
-    else:
-        final_shape = kept_axes_shape + h.shape[1:]
-        h = reshape(h, final_shape)
-
     return h

From 035636538fc9d92cc259cde28be723c1f3dda718 Mon Sep 17 00:00:00 2001
From: Ryan Abernathey <ryan.abernathey@gmail.com>
Date: Thu, 29 Apr 2021 21:47:18 -0400
Subject: [PATCH 02/21] refactored a bit; tests mostly pass

---
 xhistogram/core.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index 8314fa5..06afb3f 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -148,7 +148,15 @@ def _histogram_2d_vectorized(
 
 def _actual_histogram(all_arrays, weights, axis, bins, density):
 
+    if weights is not None:
+        all_arrays.append(weights)
+
     all_arrays_broadcast = broadcast_arrays(*all_arrays)
+
+    if weights is not None:
+        weights_broadcast = all_arrays_broadcast.pop()
+    else:
+        weights_broadcast = None
     a0 = all_arrays_broadcast[0]
     ndim = a0.ndim
 
@@ -178,8 +186,8 @@ def reshape_input(a):
 
     all_arrays_reshaped = [reshape_input(a) for a in all_arrays_broadcast]
 
-    if weights is not None:
-        weights_reshaped = all_arrays_reshaped.pop()
+    if weights_broadcast is not None:
+        weights_reshaped = reshape_input(weights_broadcast)
     else:
         weights_reshaped = None
 
@@ -282,7 +290,6 @@ def histogram(
     n_inputs = len(all_arrays)
     bins = _ensure_bins_is_a_list_of_arrays(bins, n_inputs)
 
-    print(axis)
     bin_counts = _actual_histogram(all_arrays, weights, axis, bins, density)
 
     if density:

From dca46212374e0d4ab4e56fc96aead6eb61c44f03 Mon Sep 17 00:00:00 2001
From: Ryan Abernathey <ryan.abernathey@gmail.com>
Date: Thu, 29 Apr 2021 22:22:02 -0400
Subject: [PATCH 03/21] got to the hard part

---
 xhistogram/core.py           | 30 +++++++++++++++++++++++++-----
 xhistogram/duck_array_ops.py |  6 ++++++
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index 06afb3f..105b57f 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -13,7 +13,7 @@
     concatenate,
     broadcast_arrays,
 )
-
+from .duck_array_ops import _any_dask_array
 
 def _ensure_bins_is_a_list_of_arrays(bins, N_expected):
     if len(bins) == N_expected:
@@ -88,7 +88,7 @@ def _dispatch_bincount(bin_indices, weights, N, hist_shapes, block_size=None):
         return _bincount_loop(bin_indices, weights, N, hist_shapes, block_chunks)
 
 
-def _histogram_2d_vectorized(
+def _bincount_2d_vectorized(
     *args, bins=None, weights=None, density=False, right=False, block_size=None
 ):
     """Calculate the histogram independently on each row of a 2D array"""
@@ -146,7 +146,7 @@ def _histogram_2d_vectorized(
     return bin_counts
 
 
-def _actual_histogram(all_arrays, weights, axis, bins, density):
+def _bincount(all_arrays, weights, axis, bins, density):
 
     if weights is not None:
         all_arrays.append(weights)
@@ -191,7 +191,7 @@ def reshape_input(a):
     else:
         weights_reshaped = None
 
-    bin_counts = _histogram_2d_vectorized(
+    bin_counts = _bincount_2d_vectorized(
         *all_arrays_reshaped,
         bins=bins,
         weights=weights_reshaped,
@@ -290,7 +290,27 @@ def histogram(
     n_inputs = len(all_arrays)
     bins = _ensure_bins_is_a_list_of_arrays(bins, n_inputs)
 
-    bin_counts = _actual_histogram(all_arrays, weights, axis, bins, density)
+    if _any_dask_array(weights, *all_arrays):
+        raise NotImplementedError("No dask allowed for now")
+        # We should be able to just apply the bin_count function to every
+        # block and then sum over all blocks to get the total bin count.
+        # The main challenge is to figure out the chunk shape that will come
+        # out of _bincount. We might also need to add dummy dimensions to sum
+        # over in the _bincount function
+        import dask.array as dsa
+        bin_counts = dsa.map_blocks(
+            _bincount,
+            all_arrays, weights, axis, bins, density,
+            drop_axis=axis,
+            new_axis="what?",
+            chunks="what?",
+            meta=np.array((), dtype=np.int64)
+        )
+        # sum over the block dims
+        block_dims = "?"
+        bin_counts = bin_counts.sum(block_dims)
+    else:
+        bin_counts = _bincount(all_arrays, weights, axis, bins, density)
 
     if density:
         # Normalise by dividing by bin counts and areas such that all the
diff --git a/xhistogram/duck_array_ops.py b/xhistogram/duck_array_ops.py
index b9e632e..b049252 100644
--- a/xhistogram/duck_array_ops.py
+++ b/xhistogram/duck_array_ops.py
@@ -10,6 +10,12 @@
 except ImportError:
     has_dask = False
 
+def _any_dask_array(*args):
+    if not has_dask:
+        return False
+    else:
+        import dask.array as dsa
+        return any([isinstance(a, dsa.core.Array) for a in args])
 
 def _dask_or_eager_func(name, eager_module=np, list_of_args=False, n_array_args=1):
     """Create a function that dispatches to dask for dask array inputs."""

From b0fdbdf7dab46ffb3e91511f2062d1cf45ff9cce Mon Sep 17 00:00:00 2001
From: Ryan Abernathey <ryan.abernathey@gmail.com>
Date: Thu, 29 Apr 2021 22:23:09 -0400
Subject: [PATCH 04/21] got to the hard part

---
 versioneer.py                | 267 ++++++++++++++++++++++-------------
 xhistogram/core.py           |  16 ++-
 xhistogram/duck_array_ops.py |   3 +
 3 files changed, 178 insertions(+), 108 deletions(-)

diff --git a/versioneer.py b/versioneer.py
index 64fea1c..2b54540 100644
--- a/versioneer.py
+++ b/versioneer.py
@@ -1,4 +1,3 @@
-
 # Version: 0.18
 
 """The Versioneer - like a rocketeer, but for versions.
@@ -277,6 +276,7 @@
 """
 
 from __future__ import print_function
+
 try:
     import configparser
 except ImportError:
@@ -308,11 +308,13 @@ def get_root():
         setup_py = os.path.join(root, "setup.py")
         versioneer_py = os.path.join(root, "versioneer.py")
     if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)):
-        err = ("Versioneer was unable to run the project root directory. "
-               "Versioneer requires setup.py to be executed from "
-               "its immediate directory (like 'python setup.py COMMAND'), "
-               "or in a way that lets it use sys.argv[0] to find the root "
-               "(like 'python path/to/setup.py COMMAND').")
+        err = (
+            "Versioneer was unable to run the project root directory. "
+            "Versioneer requires setup.py to be executed from "
+            "its immediate directory (like 'python setup.py COMMAND'), "
+            "or in a way that lets it use sys.argv[0] to find the root "
+            "(like 'python path/to/setup.py COMMAND')."
+        )
         raise VersioneerBadRootError(err)
     try:
         # Certain runtime workflows (setup.py install/develop in a setuptools
@@ -325,8 +327,10 @@ def get_root():
         me_dir = os.path.normcase(os.path.splitext(me)[0])
         vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0])
         if me_dir != vsr_dir:
-            print("Warning: build in %s is using versioneer.py from %s"
-                  % (os.path.dirname(me), versioneer_py))
+            print(
+                "Warning: build in %s is using versioneer.py from %s"
+                % (os.path.dirname(me), versioneer_py)
+            )
     except NameError:
         pass
     return root
@@ -348,6 +352,7 @@ def get(parser, name):
         if parser.has_option("versioneer", name):
             return parser.get("versioneer", name)
         return None
+
     cfg = VersioneerConfig()
     cfg.VCS = VCS
     cfg.style = get(parser, "style") or ""
@@ -372,17 +377,18 @@ class NotThisMethod(Exception):
 
 def register_vcs_handler(vcs, method):  # decorator
     """Decorator to mark a method as the handler for a particular VCS."""
+
     def decorate(f):
         """Store f in HANDLERS[vcs][method]."""
         if vcs not in HANDLERS:
             HANDLERS[vcs] = {}
         HANDLERS[vcs][method] = f
         return f
+
     return decorate
 
 
-def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
-                env=None):
+def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None):
     """Call the given command(s)."""
     assert isinstance(commands, list)
     p = None
@@ -390,10 +396,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
         try:
             dispcmd = str([c] + args)
             # remember shell=False, so use git.cmd on windows, not just git
-            p = subprocess.Popen([c] + args, cwd=cwd, env=env,
-                                 stdout=subprocess.PIPE,
-                                 stderr=(subprocess.PIPE if hide_stderr
-                                         else None))
+            p = subprocess.Popen(
+                [c] + args,
+                cwd=cwd,
+                env=env,
+                stdout=subprocess.PIPE,
+                stderr=(subprocess.PIPE if hide_stderr else None),
+            )
             break
         except EnvironmentError:
             e = sys.exc_info()[1]
@@ -418,7 +427,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
     return stdout, p.returncode
 
 
-LONG_VERSION_PY['git'] = '''
+LONG_VERSION_PY[
+    "git"
+] = '''
 # This file helps to compute a version number in source trees obtained from
 # git-archive tarball (such as those provided by githubs download-from-tag
 # feature). Distribution tarballs (built by setup.py sdist) and build
@@ -993,7 +1004,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
     # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
     # just "foo-1.0". If we see a "tag: " prefix, prefer those.
     TAG = "tag: "
-    tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
+    tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)])
     if not tags:
         # Either we're using git < 1.8.3, or there really are no tags. We use
         # a heuristic: assume all version tags have a digit. The old git %d
@@ -1002,7 +1013,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
         # between branches and tags. By ignoring refnames without digits, we
         # filter out many common branch names like "release" and
         # "stabilization", as well as "HEAD" and "master".
-        tags = set([r for r in refs if re.search(r'\d', r)])
+        tags = set([r for r in refs if re.search(r"\d", r)])
         if verbose:
             print("discarding '%s', no digits" % ",".join(refs - tags))
     if verbose:
@@ -1010,19 +1021,26 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
     for ref in sorted(tags):
         # sorting will prefer e.g. "2.0" over "2.0rc1"
         if ref.startswith(tag_prefix):
-            r = ref[len(tag_prefix):]
+            r = ref[len(tag_prefix) :]
             if verbose:
                 print("picking %s" % r)
-            return {"version": r,
-                    "full-revisionid": keywords["full"].strip(),
-                    "dirty": False, "error": None,
-                    "date": date}
+            return {
+                "version": r,
+                "full-revisionid": keywords["full"].strip(),
+                "dirty": False,
+                "error": None,
+                "date": date,
+            }
     # no suitable tags, so version is "0+unknown", but full hex is still there
     if verbose:
         print("no suitable tags, using unknown + full revision id")
-    return {"version": "0+unknown",
-            "full-revisionid": keywords["full"].strip(),
-            "dirty": False, "error": "no suitable tags", "date": None}
+    return {
+        "version": "0+unknown",
+        "full-revisionid": keywords["full"].strip(),
+        "dirty": False,
+        "error": "no suitable tags",
+        "date": None,
+    }
 
 
 @register_vcs_handler("git", "pieces_from_vcs")
@@ -1037,8 +1055,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
     if sys.platform == "win32":
         GITS = ["git.cmd", "git.exe"]
 
-    out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root,
-                          hide_stderr=True)
+    out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True)
     if rc != 0:
         if verbose:
             print("Directory %s not under git control" % root)
@@ -1046,10 +1063,19 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
 
     # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
     # if there isn't one, this yields HEX[-dirty] (no NUM)
-    describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty",
-                                          "--always", "--long",
-                                          "--match", "%s*" % tag_prefix],
-                                   cwd=root)
+    describe_out, rc = run_command(
+        GITS,
+        [
+            "describe",
+            "--tags",
+            "--dirty",
+            "--always",
+            "--long",
+            "--match",
+            "%s*" % tag_prefix,
+        ],
+        cwd=root,
+    )
     # --long was added in git-1.5.5
     if describe_out is None:
         raise NotThisMethod("'git describe' failed")
@@ -1072,17 +1098,16 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
     dirty = git_describe.endswith("-dirty")
     pieces["dirty"] = dirty
     if dirty:
-        git_describe = git_describe[:git_describe.rindex("-dirty")]
+        git_describe = git_describe[: git_describe.rindex("-dirty")]
 
     # now we have TAG-NUM-gHEX or HEX
 
     if "-" in git_describe:
         # TAG-NUM-gHEX
-        mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
+        mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe)
         if not mo:
             # unparseable. Maybe git-describe is misbehaving?
-            pieces["error"] = ("unable to parse git-describe output: '%s'"
-                               % describe_out)
+            pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out
             return pieces
 
         # tag
@@ -1091,10 +1116,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
             if verbose:
                 fmt = "tag '%s' doesn't start with prefix '%s'"
                 print(fmt % (full_tag, tag_prefix))
-            pieces["error"] = ("tag '%s' doesn't start with prefix '%s'"
-                               % (full_tag, tag_prefix))
+            pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % (
+                full_tag,
+                tag_prefix,
+            )
             return pieces
-        pieces["closest-tag"] = full_tag[len(tag_prefix):]
+        pieces["closest-tag"] = full_tag[len(tag_prefix) :]
 
         # distance: number of commits since tag
         pieces["distance"] = int(mo.group(2))
@@ -1105,13 +1132,13 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
     else:
         # HEX: no tags
         pieces["closest-tag"] = None
-        count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"],
-                                    cwd=root)
+        count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root)
         pieces["distance"] = int(count_out)  # total number of commits
 
     # commit date: see ISO-8601 comment in git_versions_from_keywords()
-    date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"],
-                       cwd=root)[0].strip()
+    date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[
+        0
+    ].strip()
     pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
 
     return pieces
@@ -1167,16 +1194,22 @@ def versions_from_parentdir(parentdir_prefix, root, verbose):
     for i in range(3):
         dirname = os.path.basename(root)
         if dirname.startswith(parentdir_prefix):
-            return {"version": dirname[len(parentdir_prefix):],
-                    "full-revisionid": None,
-                    "dirty": False, "error": None, "date": None}
+            return {
+                "version": dirname[len(parentdir_prefix) :],
+                "full-revisionid": None,
+                "dirty": False,
+                "error": None,
+                "date": None,
+            }
         else:
             rootdirs.append(root)
             root = os.path.dirname(root)  # up a level
 
     if verbose:
-        print("Tried directories %s but none started with prefix %s" %
-              (str(rootdirs), parentdir_prefix))
+        print(
+            "Tried directories %s but none started with prefix %s"
+            % (str(rootdirs), parentdir_prefix)
+        )
     raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
 
 
@@ -1205,11 +1238,13 @@ def versions_from_file(filename):
             contents = f.read()
     except EnvironmentError:
         raise NotThisMethod("unable to read _version.py")
-    mo = re.search(r"version_json = '''\n(.*)'''  # END VERSION_JSON",
-                   contents, re.M | re.S)
+    mo = re.search(
+        r"version_json = '''\n(.*)'''  # END VERSION_JSON", contents, re.M | re.S
+    )
     if not mo:
-        mo = re.search(r"version_json = '''\r\n(.*)'''  # END VERSION_JSON",
-                       contents, re.M | re.S)
+        mo = re.search(
+            r"version_json = '''\r\n(.*)'''  # END VERSION_JSON", contents, re.M | re.S
+        )
     if not mo:
         raise NotThisMethod("no version_json in _version.py")
     return json.loads(mo.group(1))
@@ -1218,8 +1253,7 @@ def versions_from_file(filename):
 def write_to_version_file(filename, versions):
     """Write the given version number to the given _version.py file."""
     os.unlink(filename)
-    contents = json.dumps(versions, sort_keys=True,
-                          indent=1, separators=(",", ": "))
+    contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": "))
     with open(filename, "w") as f:
         f.write(SHORT_VERSION_PY % contents)
 
@@ -1251,8 +1285,7 @@ def render_pep440(pieces):
                 rendered += ".dirty"
     else:
         # exception #1
-        rendered = "0+untagged.%d.g%s" % (pieces["distance"],
-                                          pieces["short"])
+        rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
         if pieces["dirty"]:
             rendered += ".dirty"
     return rendered
@@ -1366,11 +1399,13 @@ def render_git_describe_long(pieces):
 def render(pieces, style):
     """Render the given version pieces into the requested style."""
     if pieces["error"]:
-        return {"version": "unknown",
-                "full-revisionid": pieces.get("long"),
-                "dirty": None,
-                "error": pieces["error"],
-                "date": None}
+        return {
+            "version": "unknown",
+            "full-revisionid": pieces.get("long"),
+            "dirty": None,
+            "error": pieces["error"],
+            "date": None,
+        }
 
     if not style or style == "default":
         style = "pep440"  # the default
@@ -1390,9 +1425,13 @@ def render(pieces, style):
     else:
         raise ValueError("unknown style '%s'" % style)
 
-    return {"version": rendered, "full-revisionid": pieces["long"],
-            "dirty": pieces["dirty"], "error": None,
-            "date": pieces.get("date")}
+    return {
+        "version": rendered,
+        "full-revisionid": pieces["long"],
+        "dirty": pieces["dirty"],
+        "error": None,
+        "date": pieces.get("date"),
+    }
 
 
 class VersioneerBadRootError(Exception):
@@ -1415,8 +1454,9 @@ def get_versions(verbose=False):
     handlers = HANDLERS.get(cfg.VCS)
     assert handlers, "unrecognized VCS '%s'" % cfg.VCS
     verbose = verbose or cfg.verbose
-    assert cfg.versionfile_source is not None, \
-        "please set versioneer.versionfile_source"
+    assert (
+        cfg.versionfile_source is not None
+    ), "please set versioneer.versionfile_source"
     assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix"
 
     versionfile_abs = os.path.join(root, cfg.versionfile_source)
@@ -1470,9 +1510,13 @@ def get_versions(verbose=False):
     if verbose:
         print("unable to compute version")
 
-    return {"version": "0+unknown", "full-revisionid": None,
-            "dirty": None, "error": "unable to compute version",
-            "date": None}
+    return {
+        "version": "0+unknown",
+        "full-revisionid": None,
+        "dirty": None,
+        "error": "unable to compute version",
+        "date": None,
+    }
 
 
 def get_version():
@@ -1521,6 +1565,7 @@ def run(self):
             print(" date: %s" % vers.get("date"))
             if vers["error"]:
                 print(" error: %s" % vers["error"])
+
     cmds["version"] = cmd_version
 
     # we override "build_py" in both distutils and setuptools
@@ -1553,14 +1598,15 @@ def run(self):
             # now locate _version.py in the new build/ directory and replace
             # it with an updated value
             if cfg.versionfile_build:
-                target_versionfile = os.path.join(self.build_lib,
-                                                  cfg.versionfile_build)
+                target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build)
                 print("UPDATING %s" % target_versionfile)
                 write_to_version_file(target_versionfile, versions)
+
     cmds["build_py"] = cmd_build_py
 
     if "cx_Freeze" in sys.modules:  # cx_freeze enabled?
         from cx_Freeze.dist import build_exe as _build_exe
+
         # nczeczulin reports that py2exe won't like the pep440-style string
         # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g.
         # setup(console=[{
@@ -1581,17 +1627,21 @@ def run(self):
                 os.unlink(target_versionfile)
                 with open(cfg.versionfile_source, "w") as f:
                     LONG = LONG_VERSION_PY[cfg.VCS]
-                    f.write(LONG %
-                            {"DOLLAR": "$",
-                             "STYLE": cfg.style,
-                             "TAG_PREFIX": cfg.tag_prefix,
-                             "PARENTDIR_PREFIX": cfg.parentdir_prefix,
-                             "VERSIONFILE_SOURCE": cfg.versionfile_source,
-                             })
+                    f.write(
+                        LONG
+                        % {
+                            "DOLLAR": "$",
+                            "STYLE": cfg.style,
+                            "TAG_PREFIX": cfg.tag_prefix,
+                            "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+                            "VERSIONFILE_SOURCE": cfg.versionfile_source,
+                        }
+                    )
+
         cmds["build_exe"] = cmd_build_exe
         del cmds["build_py"]
 
-    if 'py2exe' in sys.modules:  # py2exe enabled?
+    if "py2exe" in sys.modules:  # py2exe enabled?
         try:
             from py2exe.distutils_buildexe import py2exe as _py2exe  # py3
         except ImportError:
@@ -1610,13 +1660,17 @@ def run(self):
                 os.unlink(target_versionfile)
                 with open(cfg.versionfile_source, "w") as f:
                     LONG = LONG_VERSION_PY[cfg.VCS]
-                    f.write(LONG %
-                            {"DOLLAR": "$",
-                             "STYLE": cfg.style,
-                             "TAG_PREFIX": cfg.tag_prefix,
-                             "PARENTDIR_PREFIX": cfg.parentdir_prefix,
-                             "VERSIONFILE_SOURCE": cfg.versionfile_source,
-                             })
+                    f.write(
+                        LONG
+                        % {
+                            "DOLLAR": "$",
+                            "STYLE": cfg.style,
+                            "TAG_PREFIX": cfg.tag_prefix,
+                            "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+                            "VERSIONFILE_SOURCE": cfg.versionfile_source,
+                        }
+                    )
+
         cmds["py2exe"] = cmd_py2exe
 
     # we override different "sdist" commands for both environments
@@ -1643,8 +1697,10 @@ def make_release_tree(self, base_dir, files):
             # updated value
             target_versionfile = os.path.join(base_dir, cfg.versionfile_source)
             print("UPDATING %s" % target_versionfile)
-            write_to_version_file(target_versionfile,
-                                  self._versioneer_generated_versions)
+            write_to_version_file(
+                target_versionfile, self._versioneer_generated_versions
+            )
+
     cmds["sdist"] = cmd_sdist
 
     return cmds
@@ -1699,11 +1755,13 @@ def do_setup():
     root = get_root()
     try:
         cfg = get_config_from_root(root)
-    except (EnvironmentError, configparser.NoSectionError,
-            configparser.NoOptionError) as e:
+    except (
+        EnvironmentError,
+        configparser.NoSectionError,
+        configparser.NoOptionError,
+    ) as e:
         if isinstance(e, (EnvironmentError, configparser.NoSectionError)):
-            print("Adding sample versioneer config to setup.cfg",
-                  file=sys.stderr)
+            print("Adding sample versioneer config to setup.cfg", file=sys.stderr)
             with open(os.path.join(root, "setup.cfg"), "a") as f:
                 f.write(SAMPLE_CONFIG)
         print(CONFIG_ERROR, file=sys.stderr)
@@ -1712,15 +1770,18 @@ def do_setup():
     print(" creating %s" % cfg.versionfile_source)
     with open(cfg.versionfile_source, "w") as f:
         LONG = LONG_VERSION_PY[cfg.VCS]
-        f.write(LONG % {"DOLLAR": "$",
-                        "STYLE": cfg.style,
-                        "TAG_PREFIX": cfg.tag_prefix,
-                        "PARENTDIR_PREFIX": cfg.parentdir_prefix,
-                        "VERSIONFILE_SOURCE": cfg.versionfile_source,
-                        })
-
-    ipy = os.path.join(os.path.dirname(cfg.versionfile_source),
-                       "__init__.py")
+        f.write(
+            LONG
+            % {
+                "DOLLAR": "$",
+                "STYLE": cfg.style,
+                "TAG_PREFIX": cfg.tag_prefix,
+                "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+                "VERSIONFILE_SOURCE": cfg.versionfile_source,
+            }
+        )
+
+    ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py")
     if os.path.exists(ipy):
         try:
             with open(ipy, "r") as f:
@@ -1762,8 +1823,10 @@ def do_setup():
     else:
         print(" 'versioneer.py' already in MANIFEST.in")
     if cfg.versionfile_source not in simple_includes:
-        print(" appending versionfile_source ('%s') to MANIFEST.in" %
-              cfg.versionfile_source)
+        print(
+            " appending versionfile_source ('%s') to MANIFEST.in"
+            % cfg.versionfile_source
+        )
         with open(manifest_in, "a") as f:
             f.write("include %s\n" % cfg.versionfile_source)
     else:
diff --git a/xhistogram/core.py b/xhistogram/core.py
index 105b57f..791d3b8 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -15,6 +15,7 @@
 )
 from .duck_array_ops import _any_dask_array
 
+
 def _ensure_bins_is_a_list_of_arrays(bins, N_expected):
     if len(bins) == N_expected:
         return bins
@@ -192,10 +193,7 @@ def reshape_input(a):
         weights_reshaped = None
 
     bin_counts = _bincount_2d_vectorized(
-        *all_arrays_reshaped,
-        bins=bins,
-        weights=weights_reshaped,
-        density=density
+        *all_arrays_reshaped, bins=bins, weights=weights_reshaped, density=density
     )
 
     if bin_counts.shape[0] == 1:
@@ -207,6 +205,7 @@ def reshape_input(a):
 
     return bin_counts
 
+
 def histogram(
     *args, bins=None, axis=None, weights=None, density=False, block_size="auto"
 ):
@@ -298,13 +297,18 @@ def histogram(
         # out of _bincount. We might also need to add dummy dimensions to sum
         # over in the _bincount function
         import dask.array as dsa
+
         bin_counts = dsa.map_blocks(
             _bincount,
-            all_arrays, weights, axis, bins, density,
+            all_arrays,
+            weights,
+            axis,
+            bins,
+            density,
             drop_axis=axis,
             new_axis="what?",
             chunks="what?",
-            meta=np.array((), dtype=np.int64)
+            meta=np.array((), dtype=np.int64),
         )
         # sum over the block dims
         block_dims = "?"
diff --git a/xhistogram/duck_array_ops.py b/xhistogram/duck_array_ops.py
index b049252..9acea73 100644
--- a/xhistogram/duck_array_ops.py
+++ b/xhistogram/duck_array_ops.py
@@ -10,13 +10,16 @@
 except ImportError:
     has_dask = False
 
+
 def _any_dask_array(*args):
     if not has_dask:
         return False
     else:
         import dask.array as dsa
+
         return any([isinstance(a, dsa.core.Array) for a in args])
 
+
 def _dask_or_eager_func(name, eager_module=np, list_of_args=False, n_array_args=1):
     """Create a function that dispatches to dask for dask array inputs."""
     if has_dask:

From 7b706e8f1fda7740ed0468183e9d80016ad783c9 Mon Sep 17 00:00:00 2001
From: Ryan Abernathey <ryan.abernathey@gmail.com>
Date: Thu, 29 Apr 2021 22:28:23 -0400
Subject: [PATCH 05/21] pre-commit

---
 xhistogram/core.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index 791d3b8..c3b3430 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -159,7 +159,6 @@ def _bincount(all_arrays, weights, axis, bins, density):
     else:
         weights_broadcast = None
     a0 = all_arrays_broadcast[0]
-    ndim = a0.ndim
 
     do_full_array = (axis is None) or (set(axis) == set(range(a0.ndim)))
     if do_full_array:

From 1d01ae42fc64066a0b911df7dd398210bee7dd46 Mon Sep 17 00:00:00 2001
From: Ryan Abernathey <ryan.abernathey@gmail.com>
Date: Fri, 14 May 2021 22:35:34 -0400
Subject: [PATCH 06/21] rewrote; working without dask

---
 xhistogram/core.py | 70 +++++++++++++++++++++++++++++-----------------
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index c3b3430..fe893e7 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -147,17 +147,11 @@ def _bincount_2d_vectorized(
     return bin_counts
 
 
-def _bincount(all_arrays, weights, axis, bins, density):
-
-    if weights is not None:
-        all_arrays.append(weights)
+def _bincount(*all_arrays, weights=False, axis=None, bins=None, density=None):
 
+    # is this necessary?
     all_arrays_broadcast = broadcast_arrays(*all_arrays)
 
-    if weights is not None:
-        weights_broadcast = all_arrays_broadcast.pop()
-    else:
-        weights_broadcast = None
     a0 = all_arrays_broadcast[0]
 
     do_full_array = (axis is None) or (set(axis) == set(range(a0.ndim)))
@@ -186,13 +180,13 @@ def reshape_input(a):
 
     all_arrays_reshaped = [reshape_input(a) for a in all_arrays_broadcast]
 
-    if weights_broadcast is not None:
-        weights_reshaped = reshape_input(weights_broadcast)
+    if weights:
+        weights_array = all_arrays_reshaped.pop()
     else:
-        weights_reshaped = None
+        weights_array = None
 
     bin_counts = _bincount_2d_vectorized(
-        *all_arrays_reshaped, bins=bins, weights=weights_reshaped, density=density
+        *all_arrays_reshaped, bins=bins, weights=weights_array, density=density
     )
 
     if bin_counts.shape[0] == 1:
@@ -202,6 +196,9 @@ def reshape_input(a):
         final_shape = kept_axes_shape + bin_counts.shape[1:]
         bin_counts = reshape(bin_counts, final_shape)
 
+    # now add even one more dimension (last dimension) to sum over
+    bin_counts = reshape(bin_counts, bin_counts.shape + (1,))
+
     return bin_counts
 
 
@@ -284,10 +281,19 @@ def histogram(
         axis = [int(i) for i in axis_normed]
 
     all_arrays = list(args)
-
     n_inputs = len(all_arrays)
+
+    if weights is not None:
+        all_arrays.append(weights)
+        has_weights = True
+    else:
+        has_weights = False
+
+    dtype = 'i8' if not has_weights else weights.dtype
+
     bins = _ensure_bins_is_a_list_of_arrays(bins, n_inputs)
 
+    bincount_kwargs = dict(weights=has_weights, axis=axis, bins=bins, density=density)
     if _any_dask_array(weights, *all_arrays):
         raise NotImplementedError("No dask allowed for now")
         # We should be able to just apply the bin_count function to every
@@ -297,23 +303,35 @@ def histogram(
         # over in the _bincount function
         import dask.array as dsa
 
-        bin_counts = dsa.map_blocks(
+        # here I am assuming all the arrays have the same shape
+        # probably needs to be generalized
+        input_indexes = [tuple(range(a.ndim)) for a in all_arrays]
+        input_index = input_indexes[0]
+        assert all([ii == input_index for ii in input_indexes])
+        # keep these axes in the inputs
+        keep_axes = tuple([ii for ii in input_index if ii not in axis])
+        new_axes = {max(input_index) + i: axis_len
+                    for i, axis_len in enumerate([len(bin) for bin in bins] + [1,])}
+        out_index = keep_axes + tuple(new_axes)
+
+        blockwise_args = []
+        for arg in all_arrays:
+            blockwise_args.append(arg)
+            blockwise_args.append(input_index)
+
+        bin_counts = dsa.blockwise(
             _bincount,
-            all_arrays,
-            weights,
-            axis,
-            bins,
-            density,
-            drop_axis=axis,
-            new_axis="what?",
-            chunks="what?",
-            meta=np.array((), dtype=np.int64),
+            out_index,
+            *blockwise_args,
+            new_axes=new_axes,
+            meta=np.array((), dtype)
+            **bincount_kwargs
         )
         # sum over the block dims
-        block_dims = "?"
-        bin_counts = bin_counts.sum(block_dims)
+        bin_counts = bin_counts.sum(-1)
     else:
-        bin_counts = _bincount(all_arrays, weights, axis, bins, density)
+        # drop the extra axis used for summing over blocks
+        bin_counts = _bincount(*all_arrays, **bincount_kwargs)[..., 0]
 
     if density:
         # Normalise by dividing by bin counts and areas such that all the

From ed936334e10071291f26a71e8f38dcda82458dfe Mon Sep 17 00:00:00 2001
From: Ryan Abernathey <ryan.abernathey@gmail.com>
Date: Fri, 14 May 2021 22:35:47 -0400
Subject: [PATCH 07/21] got it working with blockwise!

---
 xhistogram/core.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index fe893e7..dec9992 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -295,7 +295,6 @@ def histogram(
 
     bincount_kwargs = dict(weights=has_weights, axis=axis, bins=bins, density=density)
     if _any_dask_array(weights, *all_arrays):
-        raise NotImplementedError("No dask allowed for now")
         # We should be able to just apply the bin_count function to every
         # block and then sum over all blocks to get the total bin count.
         # The main challenge is to figure out the chunk shape that will come
@@ -309,9 +308,12 @@ def histogram(
         input_index = input_indexes[0]
         assert all([ii == input_index for ii in input_indexes])
         # keep these axes in the inputs
-        keep_axes = tuple([ii for ii in input_index if ii not in axis])
-        new_axes = {max(input_index) + i: axis_len
-                    for i, axis_len in enumerate([len(bin) for bin in bins] + [1,])}
+        if axis is not None:
+            keep_axes = tuple([ii for ii in input_index if ii not in axis])
+        else:
+            keep_axes = ()
+        new_axes = {max(input_index) + 1 + i: axis_len
+                    for i, axis_len in enumerate([len(bin) - 1 for bin in bins] + [1,])}
         out_index = keep_axes + tuple(new_axes)
 
         blockwise_args = []

From 9f20e951b002a27a7aa891b52310d8cabef8a0bf Mon Sep 17 00:00:00 2001
From: Ryan Abernathey <ryan.abernathey@gmail.com>
Date: Fri, 14 May 2021 22:41:16 -0400
Subject: [PATCH 08/21] pre-commit

---
 xhistogram/core.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index dec9992..235127c 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -289,7 +289,7 @@ def histogram(
     else:
         has_weights = False
 
-    dtype = 'i8' if not has_weights else weights.dtype
+    dtype = "i8" if not has_weights else weights.dtype
 
     bins = _ensure_bins_is_a_list_of_arrays(bins, n_inputs)
 
@@ -312,8 +312,15 @@ def histogram(
             keep_axes = tuple([ii for ii in input_index if ii not in axis])
         else:
             keep_axes = ()
-        new_axes = {max(input_index) + 1 + i: axis_len
-                    for i, axis_len in enumerate([len(bin) - 1 for bin in bins] + [1,])}
+        new_axes = {
+            max(input_index) + 1 + i: axis_len
+            for i, axis_len in enumerate(
+                [len(bin) - 1 for bin in bins]
+                + [
+                    1,
+                ]
+            )
+        }
         out_index = keep_axes + tuple(new_axes)
 
         blockwise_args = []
@@ -326,8 +333,7 @@ def histogram(
             out_index,
             *blockwise_args,
             new_axes=new_axes,
-            meta=np.array((), dtype)
-            **bincount_kwargs
+            meta=np.array((), dtype) ** bincount_kwargs,
         )
         # sum over the block dims
         bin_counts = bin_counts.sum(-1)

From 3b4939b2edb7f276cf7a3a3a3b9c3d77557d1553 Mon Sep 17 00:00:00 2001
From: Ryan Abernathey <ryan.abernathey@gmail.com>
Date: Mon, 24 May 2021 17:38:36 -0400
Subject: [PATCH 09/21] fix syntax bug

---
 xhistogram/core.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index 235127c..80dd142 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -333,7 +333,8 @@ def histogram(
             out_index,
             *blockwise_args,
             new_axes=new_axes,
-            meta=np.array((), dtype) ** bincount_kwargs,
+            meta=np.array((), dtype), 
+            **bincount_kwargs,
         )
         # sum over the block dims
         bin_counts = bin_counts.sum(-1)

From 9c7c7227c27af3f663261c5a6ad7a2022ea0fbbc Mon Sep 17 00:00:00 2001
From: Ryan Abernathey <ryan.abernathey@gmail.com>
Date: Mon, 24 May 2021 22:47:53 -0400
Subject: [PATCH 10/21] fix broken stuff

---
 xhistogram/core.py | 68 ++++++++++++++++++++++++----------------------
 1 file changed, 36 insertions(+), 32 deletions(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index 80dd142..b2428bc 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -155,10 +155,13 @@ def _bincount(*all_arrays, weights=False, axis=None, bins=None, density=None):
     a0 = all_arrays_broadcast[0]
 
     do_full_array = (axis is None) or (set(axis) == set(range(a0.ndim)))
+
     if do_full_array:
-        kept_axes_shape = None
+        kept_axes_shape = (1,) * a0.ndim
     else:
-        kept_axes_shape = tuple([a0.shape[i] for i in range(a0.ndim) if i not in axis])
+        kept_axes_shape = tuple(
+            [a0.shape[i] if i not in axis else 1 for i in range(a0.ndim)]
+        )
 
     def reshape_input(a):
         if do_full_array:
@@ -171,7 +174,7 @@ def reshape_input(a):
             c = np.moveaxis(a, axis, new_pos)
             split_idx = c.ndim - len(axis)
             dims_0 = c.shape[:split_idx]
-            assert dims_0 == kept_axes_shape
+            # assert dims_0 == kept_axes_shape
             dims_1 = c.shape[split_idx:]
             new_dim_0 = np.prod(dims_0)
             new_dim_1 = np.prod(dims_1)
@@ -189,15 +192,8 @@ def reshape_input(a):
         *all_arrays_reshaped, bins=bins, weights=weights_array, density=density
     )
 
-    if bin_counts.shape[0] == 1:
-        assert do_full_array
-        bin_counts = bin_counts.squeeze()
-    else:
-        final_shape = kept_axes_shape + bin_counts.shape[1:]
-        bin_counts = reshape(bin_counts, final_shape)
-
-    # now add even one more dimension (last dimension) to sum over
-    bin_counts = reshape(bin_counts, bin_counts.shape + (1,))
+    final_shape = kept_axes_shape + bin_counts.shape[1:]
+    bin_counts = reshape(bin_counts, final_shape)
 
     return bin_counts
 
@@ -294,6 +290,19 @@ def histogram(
     bins = _ensure_bins_is_a_list_of_arrays(bins, n_inputs)
 
     bincount_kwargs = dict(weights=has_weights, axis=axis, bins=bins, density=density)
+
+    # here I am assuming all the arrays have the same shape
+    # probably needs to be generalized
+    input_indexes = [tuple(range(a.ndim)) for a in all_arrays]
+    input_index = input_indexes[0]
+    assert all([ii == input_index for ii in input_indexes])
+
+    # keep these axes in the inputs
+    if axis is not None:
+        drop_axes = tuple([ii for ii in input_index if ii in axis])
+    else:
+        drop_axes = input_index
+
     if _any_dask_array(weights, *all_arrays):
         # We should be able to just apply the bin_count function to every
         # block and then sum over all blocks to get the total bin count.
@@ -302,26 +311,20 @@ def histogram(
         # over in the _bincount function
         import dask.array as dsa
 
-        # here I am assuming all the arrays have the same shape
-        # probably needs to be generalized
-        input_indexes = [tuple(range(a.ndim)) for a in all_arrays]
-        input_index = input_indexes[0]
-        assert all([ii == input_index for ii in input_indexes])
-        # keep these axes in the inputs
-        if axis is not None:
-            keep_axes = tuple([ii for ii in input_index if ii not in axis])
-        else:
-            keep_axes = ()
+        # Important note from blockwise docs
+        # > Any index, like i missing from the output index is interpreted as a contraction...
+        # > In the case of a contraction the passed function should expect an iterable of blocks
+        # > on any array that holds that index.
+        # This means that we need to have all the input indexes present in the output index
+        # However, they will be reduced to singleton (len 1) dimensions
+
+        adjust_chunks = {i: (lambda x: 1) for i in drop_axes}
+
         new_axes = {
             max(input_index) + 1 + i: axis_len
-            for i, axis_len in enumerate(
-                [len(bin) - 1 for bin in bins]
-                + [
-                    1,
-                ]
-            )
+            for i, axis_len in enumerate([len(bin) - 1 for bin in bins])
         }
-        out_index = keep_axes + tuple(new_axes)
+        out_index = input_index + tuple(new_axes)
 
         blockwise_args = []
         for arg in all_arrays:
@@ -333,14 +336,15 @@ def histogram(
             out_index,
             *blockwise_args,
             new_axes=new_axes,
-            meta=np.array((), dtype), 
+            adjust_chunks=adjust_chunks,
+            meta=np.array((), dtype),
             **bincount_kwargs,
         )
         # sum over the block dims
-        bin_counts = bin_counts.sum(-1)
+        bin_counts = bin_counts.sum(drop_axes)
     else:
         # drop the extra axis used for summing over blocks
-        bin_counts = _bincount(*all_arrays, **bincount_kwargs)[..., 0]
+        bin_counts = _bincount(*all_arrays, **bincount_kwargs).squeeze(drop_axes)
 
     if density:
         # Normalise by dividing by bin counts and areas such that all the

From ca331cdd38b8fe97df5c1aabbbf0ea409eba8e3a Mon Sep 17 00:00:00 2001
From: Ryan Abernathey <ryan.abernathey@gmail.com>
Date: Tue, 1 Jun 2021 12:41:39 -0400
Subject: [PATCH 11/21] remove duck_array_ops  module

---
 xhistogram/core.py                     | 16 +++++-
 xhistogram/duck_array_ops.py           | 48 ----------------
 xhistogram/test/test_duck_array_ops.py | 80 --------------------------
 3 files changed, 15 insertions(+), 129 deletions(-)
 delete mode 100644 xhistogram/duck_array_ops.py
 delete mode 100644 xhistogram/test/test_duck_array_ops.py

diff --git a/xhistogram/core.py b/xhistogram/core.py
index b2428bc..13ce19f 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -13,7 +13,21 @@
     concatenate,
     broadcast_arrays,
 )
-from .duck_array_ops import _any_dask_array
+
+
+try:
+    import dask.array as dsa
+
+    has_dask = True
+except ImportError:
+    has_dask = False
+
+
+def _any_dask_array(*args):
+    if not has_dask:
+        return False
+    else:
+        return any(isinstance(a, dsa.core.Array) for a in args)
 
 
 def _ensure_bins_is_a_list_of_arrays(bins, N_expected):
diff --git a/xhistogram/duck_array_ops.py b/xhistogram/duck_array_ops.py
deleted file mode 100644
index 9acea73..0000000
--- a/xhistogram/duck_array_ops.py
+++ /dev/null
@@ -1,48 +0,0 @@
-"""Compatibility module defining operations on duck numpy-arrays.
-Shamelessly copied from xarray."""
-
-import numpy as np
-
-try:
-    import dask.array as dsa
-
-    has_dask = True
-except ImportError:
-    has_dask = False
-
-
-def _any_dask_array(*args):
-    if not has_dask:
-        return False
-    else:
-        import dask.array as dsa
-
-        return any([isinstance(a, dsa.core.Array) for a in args])
-
-
-def _dask_or_eager_func(name, eager_module=np, list_of_args=False, n_array_args=1):
-    """Create a function that dispatches to dask for dask array inputs."""
-    if has_dask:
-
-        def f(*args, **kwargs):
-            dispatch_args = args[0] if list_of_args else args
-            if any(isinstance(a, dsa.Array) for a in dispatch_args[:n_array_args]):
-                module = dsa
-            else:
-                module = eager_module
-            return getattr(module, name)(*args, **kwargs)
-
-    else:
-
-        def f(*args, **kwargs):
-            return getattr(eager_module, name)(*args, **kwargs)
-
-    return f
-
-
-digitize = _dask_or_eager_func("digitize")
-bincount = _dask_or_eager_func("bincount")
-reshape = _dask_or_eager_func("reshape")
-concatenate = _dask_or_eager_func("concatenate", list_of_args=True)
-broadcast_arrays = _dask_or_eager_func("broadcast_arrays")
-ravel_multi_index = _dask_or_eager_func("ravel_multi_index")
diff --git a/xhistogram/test/test_duck_array_ops.py b/xhistogram/test/test_duck_array_ops.py
deleted file mode 100644
index db3cbd6..0000000
--- a/xhistogram/test/test_duck_array_ops.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import numpy as np
-import dask.array as dsa
-from ..duck_array_ops import (
-    digitize,
-    bincount,
-    reshape,
-    ravel_multi_index,
-    broadcast_arrays,
-)
-from .fixtures import empty_dask_array
-import pytest
-
-
-@pytest.mark.parametrize(
-    "function, args",
-    [
-        (digitize, [np.random.rand(5, 12), np.linspace(0, 1, 7)]),
-        (bincount, [np.arange(10)]),
-    ],
-)
-def test_eager(function, args):
-    a = function(*args)
-    assert isinstance(a, np.ndarray)
-
-
-@pytest.mark.parametrize(
-    "function, args, kwargs",
-    [
-        (digitize, [empty_dask_array((5, 12)), np.linspace(0, 1, 7)], {}),
-        (bincount, [empty_dask_array((10,))], {"minlength": 5}),
-        (reshape, [empty_dask_array((10, 5)), (5, 10)], {}),
-        (ravel_multi_index, (empty_dask_array((10,)), empty_dask_array((10,))), {}),
-    ],
-)
-def test_lazy(function, args, kwargs):
-    # make sure nothing computes
-    a = function(*args, **kwargs)
-    assert isinstance(a, dsa.core.Array)
-
-
-@pytest.mark.parametrize("chunks", [(5, 12), (1, 12), (5, 1)])
-def test_digitize_dask_correct(chunks):
-    a = np.random.rand(5, 12)
-    da = dsa.from_array(a, chunks=chunks)
-    bins = np.linspace(0, 1, 7)
-    d = digitize(a, bins)
-    dd = digitize(da, bins)
-    np.testing.assert_array_equal(d, dd.compute())
-
-
-def test_ravel_multi_index_correct():
-    arr = np.array([[3, 6, 6], [4, 5, 1]])
-    expected = np.ravel_multi_index(arr, (7, 6))
-    actual = ravel_multi_index(arr, (7, 6))
-    np.testing.assert_array_equal(expected, actual)
-
-    expected = np.ravel_multi_index(arr, (7, 6), order="F")
-    actual = ravel_multi_index(arr, (7, 6), order="F")
-    np.testing.assert_array_equal(expected, actual)
-
-
-def test_broadcast_arrays_numpy():
-    a1 = np.empty((1, 5, 25))
-    a2 = np.empty((4, 1, 1))
-
-    a1b, a2b = broadcast_arrays(a1, a2)
-    assert a1b.shape == (4, 5, 25)
-    assert a2b.shape == (4, 5, 25)
-
-
-@pytest.mark.parametrize("d1_chunks", [(5 * (1,), (25,)), ((2, 3), (25,))])
-def test_broadcast_arrays_dask(d1_chunks):
-    d1 = dsa.empty((5, 25), chunks=d1_chunks)
-    d2 = dsa.empty((1, 25), chunks=(1, 25))
-
-    d1b, d2b = broadcast_arrays(d1, d2)
-    assert d1b.shape == (5, 25)
-    assert d2b.shape == (5, 25)
-    assert d1b.chunks == d1_chunks
-    assert d2b.chunks == d1_chunks

From 48cec393da79cff2802742de41272adb3aea1d67 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <tegn500@york.ac.uk>
Date: Fri, 4 Jun 2021 14:08:31 -0400
Subject: [PATCH 12/21] drop weights argument to np.histogram_bin_edges to
 ensure tests work

---
 xhistogram/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index 692c245..e470744 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -378,7 +378,7 @@ def histogram(
             )
     else:
         bins = [
-            np.histogram_bin_edges(a, b, r, weights)
+            np.histogram_bin_edges(a, b, r)
             for a, b, r in zip(all_arrays, bins, range)
         ]
     bincount_kwargs = dict(weights=has_weights, axis=axis, bins=bins, density=density)

From 8185fa77e4987c25247f8eb4f1ab87957c6d7992 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <tegn500@york.ac.uk>
Date: Fri, 4 Jun 2021 16:50:44 -0400
Subject: [PATCH 13/21] simple tests

---
 xhistogram/test/fixtures.py    |  8 ++++++
 xhistogram/test/test_xarray.py | 52 ++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/xhistogram/test/fixtures.py b/xhistogram/test/fixtures.py
index 330d07b..d6d6ee8 100644
--- a/xhistogram/test/fixtures.py
+++ b/xhistogram/test/fixtures.py
@@ -1,5 +1,7 @@
 import dask
 import dask.array as dsa
+import numpy as np
+import xarray as xr
 
 
 def empty_dask_array(shape, dtype=float, chunks=None):
@@ -12,3 +14,9 @@ def raise_if_computed():
         a = a.rechunk(chunks)
 
     return a
+
+def example_dataarray(shape=(5, 20)):
+    data = np.random.randn(*shape)
+    dims = [f"dim_{i}" for i in range(len(shape))]
+    da = xr.DataArray(data, dims=dims, name="T")
+    return da
diff --git a/xhistogram/test/test_xarray.py b/xhistogram/test/test_xarray.py
index 7170537..4e64213 100644
--- a/xhistogram/test/test_xarray.py
+++ b/xhistogram/test/test_xarray.py
@@ -4,6 +4,7 @@
 import pandas as pd
 from itertools import combinations
 
+from .fixtures import example_dataarray
 from ..xarray import histogram
 
 
@@ -206,3 +207,54 @@ def test_input_type_check():
     np_array = np.arange(100)
     with pytest.raises(TypeError):
         histogram(np_array)
+
+
+class TestMultiDimensionalBins:
+    def test_bin_dataarrays_with_extra_dims(self):
+        data = xr.DataArray([0], dims=["x"], name="a")
+        bins = xr.DataArray([[1]], dims=["bad", "a_bins"])
+        with pytest.raises(ValueError, match="will not be broadcast"):
+            histogram(data, dim="x", bins=[bins])
+
+    def test_bin_dataarrays_without_reduce_dim(self):
+        data = xr.DataArray([0], dims=["x"], name="a")
+        bins = xr.DataArray(1)
+        with pytest.raises(ValueError, match="does not contain"):
+            histogram(data, dim="x", bins=[bins])
+
+    # TODO parametrize over ndims?
+    def test_1d_bins_da(self):
+        data_a = example_dataarray(shape=(10, 12))
+        nbins_a = 8
+        bins_a = xr.DataArray(np.linspace(-4, 4, nbins_a + 1),
+                              dims=f'{data_a.name}_bins')
+
+        h = histogram(data_a, bins=[bins_a])
+
+        assert h.shape == (nbins_a,)
+
+        hist, _ = np.histogram(data_a.values, bins=bins_a)
+
+        np.testing.assert_allclose(hist, h.values)
+
+    def test_2d_bins_da(self):
+        data_a = example_dataarray(shape=(10, 2))
+        nbins_a = 7
+        bins_a = xr.DataArray([np.linspace(-4, 3, nbins_a + 1),
+                              np.linspace(-3, 4, nbins_a + 1)],
+                              dims=['dim_1', f'{data_a.name}_bins'])
+
+        h = histogram(data_a, dim='dim_0', bins=[bins_a])
+
+        assert h.shape == (nbins_a, 2)
+
+        def _np_hist(*args, **kwargs):
+            h, _ = np.histogram(*args, **kwargs)
+            return h
+
+        hist = np.apply_along_axis(
+            _np_hist, axis=0, arr=data_a.values,
+            bins=[bins_a]
+        )
+
+        np.testing.assert_allclose(hist, h.values)

From 9200266ca559262c6985f4568db1d23d9e6f1822 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <tegn500@york.ac.uk>
Date: Fri, 4 Jun 2021 16:51:25 -0400
Subject: [PATCH 14/21] allow bins to be dataarrays and pass them down into
 bincount

---
 xhistogram/core.py   | 39 ++++++++++++++++++++++++++++-----------
 xhistogram/xarray.py | 23 +++++++++++++++++++++--
 2 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index e470744..9e8abba 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -135,7 +135,7 @@ def _dispatch_bincount(bin_indices, weights, N, hist_shapes, block_size=None):
 
 
 def _bincount_2d_vectorized(
-    *args, bins=None, weights=None, density=False, right=False, block_size=None
+    *args, bins=None, weights=None, block_size=None
 ):
     """Calculate the histogram independently on each row of a 2D array"""
 
@@ -192,10 +192,17 @@ def _bincount_2d_vectorized(
     return bin_counts
 
 
-def _bincount(*all_arrays, weights=False, axis=None, bins=None, density=None):
+def _bincount(*all_arrays, weights=False, axis=None, density=None):
+
+    # TODO a more robust way to pass the bins together with the arrays
+    n_args = len(all_arrays) // 2
+    arrays = all_arrays[:n_args]
+    print(arrays[0].shape)
+    bins = all_arrays[n_args:]
+    print(bins[0].shape)
 
     # is this necessary?
-    all_arrays_broadcast = broadcast_arrays(*all_arrays)
+    all_arrays_broadcast = broadcast_arrays(*arrays)
 
     a0 = all_arrays_broadcast[0]
 
@@ -227,6 +234,7 @@ def reshape_input(a):
         return d
 
     all_arrays_reshaped = [reshape_input(a) for a in all_arrays_broadcast]
+    bins_reshaped = [reshape_input(b) for b in bins]
 
     if weights:
         weights_array = all_arrays_reshaped.pop()
@@ -234,7 +242,7 @@ def reshape_input(a):
         weights_array = None
 
     bin_counts = _bincount_2d_vectorized(
-        *all_arrays_reshaped, bins=bins, weights=weights_array, density=density
+        *all_arrays_reshaped, bins=bins_reshaped, weights=weights_array
     )
 
     final_shape = kept_axes_shape + bin_counts.shape[1:]
@@ -365,7 +373,7 @@ def histogram(
     assert all([ii == input_index for ii in input_indexes])
 
     # Some sanity checks and format bins and range correctly
-    bins = _ensure_correctly_formatted_bins(bins, n_inputs)
+    formatted_bins = _ensure_correctly_formatted_bins(bins, n_inputs)
     range = _ensure_correctly_formatted_range(range, n_inputs)
 
     # histogram_bin_edges trigges computation on dask arrays. It would be possible
@@ -377,11 +385,14 @@ def histogram(
                 "When using dask arrays, bins must be provided as numpy array(s) of edges"
             )
     else:
-        bins = [
-            np.histogram_bin_edges(a, b, r)
-            for a, b, r in zip(all_arrays, bins, range)
-        ]
-    bincount_kwargs = dict(weights=has_weights, axis=axis, bins=bins, density=density)
+        bins = []
+        for a, b, r in zip(all_arrays, formatted_bins, range):
+            if isinstance(b, np.ndarray):
+                pass
+            else:
+                b = np.histogram_bin_edges(a, b, r)
+            bins.append(b)
+    bincount_kwargs = dict(weights=has_weights, axis=axis, density=density)
 
     # keep these axes in the inputs
     if axis is not None:
@@ -417,6 +428,12 @@ def histogram(
             blockwise_args.append(arg)
             blockwise_args.append(input_index)
 
+        # Bins arrays do not contain axes which will get reduced along
+        bins_input_index = tuple(ii for ii in out_index if ii not in drop_axes)
+        for b in bins:
+            blockwise_args.append(b)
+            blockwise_args.append(bins_input_index)
+
         bin_counts = dsa.blockwise(
             _bincount,
             out_index,
@@ -430,7 +447,7 @@ def histogram(
         bin_counts = bin_counts.sum(drop_axes)
     else:
         # drop the extra axis used for summing over blocks
-        bin_counts = _bincount(*all_arrays, **bincount_kwargs).squeeze(drop_axes)
+        bin_counts = _bincount(*(all_arrays + bins), **bincount_kwargs).squeeze(drop_axes)
 
     if density:
         # Normalise by dividing by bin counts and areas such that all the
diff --git a/xhistogram/xarray.py b/xhistogram/xarray.py
index a587abc..a74b5cd 100644
--- a/xhistogram/xarray.py
+++ b/xhistogram/xarray.py
@@ -29,7 +29,7 @@ def histogram(
         Input data. The number of input arguments determines the dimensonality of
         the histogram. For example, two arguments prodocue a 2D histogram. All
         args must be aligned and have the same dimensions.
-    bins :  int, str or numpy array or a list of ints, strs and/or arrays, optional
+    bins :  int, str, numpy array or DataArray, or a list of ints, strs, arrays and/or DataArrays, optional
         If a list, there should be one entry for each item in ``args``.
         The bin specifications are as follows:
 
@@ -37,6 +37,10 @@ def histogram(
           * If str; the method used to automatically calculate the optimal bin width
             for all arguments in ``args``, as defined by numpy `histogram_bin_edges`.
           * If numpy array; the bin edges for all arguments in ``args``.
+          * If xarray DataArray: the bin edges for all arguments in ``args``.
+            The DataArray can be multidimensional, but must contain the output
+            bins dimension (i.e. `[var]_bins`) and must not have any dimensions
+            present in the `dim` argument.
           * If a list of ints, strs and/or arrays; the bin specification as
             above for every argument in ``args``.
 
@@ -153,6 +157,21 @@ def histogram(
     else:
         weights_data = None
 
+    if isinstance(dim, str):
+        dim = (dim,)
+
+    # align bins if DataArrays
+    # TODO check correct dimensions exist
+    # Drop dimensions that will be reduced along before aligning bins
+    output_shape = a0.isel(**{d: 0 for d in dim}, drop=True)
+    aligned_bins = []
+    for bin in bins:
+        if isinstance(bin, xr.DataArray):
+            aligned_bin, _ = xr.align(bin, output_shape, join="exact")
+            aligned_bins.append(aligned_bin.values)
+        else:
+            aligned_bins.append(bin)
+
     if dim is not None:
         dims_to_keep = [d for d in all_dims_ordered if d not in dim]
         axis = [args_transposed[0].get_axis_num(d) for d in dim]
@@ -163,7 +182,7 @@ def histogram(
     h_data, bins = _histogram(
         *args_data,
         weights=weights_data,
-        bins=bins,
+        bins=aligned_bins,
         range=range,
         axis=axis,
         density=density,

From b1ab869de7a00b6b2ae8341d5a38e0666284869a Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <tegn500@york.ac.uk>
Date: Mon, 7 Jun 2021 09:52:40 -0400
Subject: [PATCH 15/21] output has right shape, wrong values

---
 xhistogram/core.py             | 29 ++++++++++++---
 xhistogram/test/test_xarray.py | 22 +++++++-----
 xhistogram/xarray.py           | 65 +++++++++++++++++++++++-----------
 3 files changed, 82 insertions(+), 34 deletions(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index 9e8abba..9c36bcd 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -145,14 +145,12 @@ def _bincount_2d_vectorized(
     # consistency checks for inputa
     for a, b in zip(args, bins):
         assert a.ndim == 2
-        assert b.ndim == 1
+        #assert b.ndim == 1
         assert a.shape == a0.shape
     if weights is not None:
         assert weights.shape == a0.shape
 
     nrows, ncols = a0.shape
-    nbins = [len(b) for b in bins]
-    hist_shapes = [nb + 1 for nb in nbins]
 
     # a marginally faster implementation would be to use searchsorted,
     # like numpy histogram itself does
@@ -164,14 +162,35 @@ def _bincount_2d_vectorized(
     # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/calibration.py#L592
     # but a better approach would be to use something like _search_sorted_inclusive() in
     # numpy histogram. This is an additional motivation for moving to searchsorted
-    bins = [np.concatenate((b[:-1], b[-1:] + 1e-8)) for b in bins]
+    #bins = [np.concatenate((b[:-1], b[-1:] + 1e-8)) for b in bins]
 
     # the maximum possible value of of digitize is nbins
     # for right=False:
     #   - 0 corresponds to a < b[0]
     #   - i corresponds to bins[i-1] <= a < b[i]
     #   - nbins corresponds to a a >= b[1]
-    each_bin_indices = [digitize(a, b) for a, b in zip(args, bins)]
+
+    # TODO assuming all bins have same form here
+    b = bins[0]
+    if b.ndim == 1:
+        nbins = [len(b) for b in bins]
+        hist_shapes = [nb + 1 for nb in nbins]
+
+        each_bin_indices = [digitize(a, b) for a, b in zip(args, bins)]
+    elif b.ndim == 2:
+        nbins = [b.shape[1] for b in bins]
+        hist_shapes = [nb + 1 for nb in nbins]
+
+        # Apply digitize separately to each row with different bins
+        each_bin_indices = []
+        for a, b in zip(args, bins):
+            each_bin_indices_single_var = np.stack([digitize(a[row, :], b[row, :])
+                                                   for row in np.arange(a.shape[0])])
+            each_bin_indices.append(each_bin_indices_single_var)
+
+    # TODO check if this array is correct!
+    print(each_bin_indices)
+
     # product of the bins gives the joint distribution
     if N_inputs > 1:
         bin_indices = ravel_multi_index(each_bin_indices, hist_shapes)
diff --git a/xhistogram/test/test_xarray.py b/xhistogram/test/test_xarray.py
index 4e64213..f022171 100644
--- a/xhistogram/test/test_xarray.py
+++ b/xhistogram/test/test_xarray.py
@@ -212,7 +212,7 @@ def test_input_type_check():
 class TestMultiDimensionalBins:
     def test_bin_dataarrays_with_extra_dims(self):
         data = xr.DataArray([0], dims=["x"], name="a")
-        bins = xr.DataArray([[1]], dims=["bad", "a_bins"])
+        bins = xr.DataArray([[1]], dims=["bad", "a_bin"])
         with pytest.raises(ValueError, match="will not be broadcast"):
             histogram(data, dim="x", bins=[bins])
 
@@ -227,7 +227,7 @@ def test_1d_bins_da(self):
         data_a = example_dataarray(shape=(10, 12))
         nbins_a = 8
         bins_a = xr.DataArray(np.linspace(-4, 4, nbins_a + 1),
-                              dims=f'{data_a.name}_bins')
+                              dims=f'{data_a.name}_bin')
 
         h = histogram(data_a, bins=[bins_a])
 
@@ -242,19 +242,25 @@ def test_2d_bins_da(self):
         nbins_a = 7
         bins_a = xr.DataArray([np.linspace(-4, 3, nbins_a + 1),
                               np.linspace(-3, 4, nbins_a + 1)],
-                              dims=['dim_1', f'{data_a.name}_bins'])
+                              dims=['dim_1', f'{data_a.name}_bin'])
 
         h = histogram(data_a, dim='dim_0', bins=[bins_a])
 
-        assert h.shape == (nbins_a, 2)
+        print(h)
+
+        assert h.shape == (2, nbins_a)
 
         def _np_hist(*args, **kwargs):
             h, _ = np.histogram(*args, **kwargs)
             return h
 
-        hist = np.apply_along_axis(
-            _np_hist, axis=0, arr=data_a.values,
-            bins=[bins_a]
-        )
+        #hist = np.apply_along_axis(
+        #    _np_hist, axis=0, arr=data_a.values,
+        #    bins=[bins_a]
+        #)
+
+
+        hist = np.stack([_np_hist(data_a.values[0, :], bins=bins_a[0, :]),
+                         _np_hist(data_a.values[1, :], bins=bins_a[1, :])])
 
         np.testing.assert_allclose(hist, h.values)
diff --git a/xhistogram/xarray.py b/xhistogram/xarray.py
index a74b5cd..1c6f081 100644
--- a/xhistogram/xarray.py
+++ b/xhistogram/xarray.py
@@ -160,52 +160,70 @@ def histogram(
     if isinstance(dim, str):
         dim = (dim,)
 
+    if dim is not None:
+        dims_to_keep = [d for d in all_dims_ordered if d not in dim]
+        axis = [args_transposed[0].get_axis_num(d) for d in dim]
+    else:
+        dims_to_keep = []
+        axis = None
+
+    # create output dims
+    new_dims = [a.name + bin_dim_suffix for a in args[:N_args]]
+    output_dims = dims_to_keep + new_dims
+
     # align bins if DataArrays
+    # Need to align so that the var_bins dim is last, similar to the reduce dims on the data
     # TODO check correct dimensions exist
     # Drop dimensions that will be reduced along before aligning bins
-    output_shape = a0.isel(**{d: 0 for d in dim}, drop=True)
+    if dim is None:
+        output_shape = a0
+    else:
+        output_shape = a0.isel(**{d: 0 for d in dim}, drop=True)
     aligned_bins = []
-    for bin in bins:
+    for bin, new_dim in zip(bins, new_dims):
         if isinstance(bin, xr.DataArray):
             aligned_bin, _ = xr.align(bin, output_shape, join="exact")
-            aligned_bins.append(aligned_bin.values)
+            if new_dim not in aligned_bin.dims:
+                raise ValueError(f"bins DataArray does not contain dimension {new_dim}")
+            aligned_bin = aligned_bin.transpose(new_dim, ...)
+            aligned_bins.append(aligned_bin)
+            print(aligned_bin)
         else:
             aligned_bins.append(bin)
 
-    if dim is not None:
-        dims_to_keep = [d for d in all_dims_ordered if d not in dim]
-        axis = [args_transposed[0].get_axis_num(d) for d in dim]
-    else:
-        dims_to_keep = []
-        axis = None
-
     h_data, bins = _histogram(
         *args_data,
         weights=weights_data,
-        bins=aligned_bins,
+        bins=[b.values for b in aligned_bins],
         range=range,
         axis=axis,
         density=density,
         block_size=block_size,
     )
 
-    # create output dims
-    new_dims = [a.name + bin_dim_suffix for a in args[:N_args]]
-    output_dims = dims_to_keep + new_dims
-
     # create new coords
-    bin_centers = [0.5 * (bin[:-1] + bin[1:]) for bin in bins]
-    new_coords = {
-        name: ((name,), bin_center, a.attrs)
-        for name, bin_center, a in zip(new_dims, bin_centers, args)
-    }
+    #bin_centers = [0.5 * (bin[:-1] + bin[1:]) for bin in bins]
+    #new_coords = {
+    #    name: ((name,), bin_center, a.attrs)
+    #    for name, bin_center, a in zip(new_dims, bin_centers, args)
+    #}
+    # Adjust bin coords to return positions of bin centres rather than bin edges
+    def _find_centers(da, dim):
+        return 0.5 * (da.isel(**{dim: slice(None, -1, None)})
+                      + da.isel(**{dim: slice(1, None, None)}))
+
+    bin_centers = [
+        _find_centers(bin, new_bin_dim) for bin, new_bin_dim in zip(aligned_bins, new_dims)
+    ]
+
 
     # old coords associated with dims
     old_dim_coords = {name: a0[name] for name in dims_to_keep if name in a_coords}
 
     all_coords = {}
     all_coords.update(old_dim_coords)
-    all_coords.update(new_coords)
+    print(bin_centers)
+    all_coords.update({b.name: b for b in bin_centers})
     # add compatible coords
     if keep_coords:
         for c in a_coords:
@@ -214,6 +232,11 @@ def histogram(
 
     output_name = "_".join(["histogram"] + [a.name for a in args[:N_args]])
 
+    print(h_data)
+    print(h_data.shape)
+    print(output_dims)
+    print(all_coords)
+    print(output_name)
     da_out = xr.DataArray(h_data, dims=output_dims, coords=all_coords, name=output_name)
 
     if density:

From 142c9bcf4d60b9a9b8e39c264147117b7a019dc9 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <tegn500@york.ac.uk>
Date: Mon, 7 Jun 2021 11:32:27 -0400
Subject: [PATCH 16/21] output bin coordinates correctly

---
 xhistogram/xarray.py | 46 ++++++++++++++++++++------------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

diff --git a/xhistogram/xarray.py b/xhistogram/xarray.py
index 1c6f081..9862706 100644
--- a/xhistogram/xarray.py
+++ b/xhistogram/xarray.py
@@ -171,52 +171,48 @@ def histogram(
     new_dims = [a.name + bin_dim_suffix for a in args[:N_args]]
     output_dims = dims_to_keep + new_dims
 
-    # align bins if DataArrays
-    # Need to align so that the var_bins dim is last, similar to the reduce dims on the data
-    # TODO check correct dimensions exist
-    # Drop dimensions that will be reduced along before aligning bins
-    if dim is None:
-        output_shape = a0
-    else:
-        output_shape = a0.isel(**{d: 0 for d in dim}, drop=True)
-    aligned_bins = []
+    # Create bin coordinates
+    bin_coords = []
     for bin, new_dim in zip(bins, new_dims):
         if isinstance(bin, xr.DataArray):
-            aligned_bin, _ = xr.align(bin, output_shape, join="exact")
-            if new_dim not in aligned_bin.dims:
-                raise ValueError(f"bins DataArray does not contain dimension {new_dim}")
-            aligned_bin = aligned_bin.transpose(new_dim, ...)
-            aligned_bins.append(aligned_bin)
-            print(aligned_bin)
+            # align bins if already DataArrays
+            # Drop dimensions that will be reduced along before aligning bins
+            if dim is None:
+                output_shape = a0
+            else:
+                output_shape = a0.isel(**{d: 0 for d in dim}, drop=True)
+            aligned_bin_coord, _ = xr.align(bin, output_shape, join="exact")
+
+            # TODO check correct dimensions exist
+            if new_dim not in aligned_bin_coord.dims:
+                raise ValueError(
+                    f"bins DataArray does not contain dimension {new_dim}")
+
+            # Need to align so that the var_bins dim is last, similar to the reduce dims on the data
+            bin_coord = aligned_bin_coord.transpose(new_dim, ...)
         else:
-            aligned_bins.append(bin)
+            bin_coord = xr.DataArray(bin, name=new_dim, dims=(new_dim,), attrs=a0.attrs)
+        bin_coords.append(bin_coord)
 
     h_data, bins = _histogram(
         *args_data,
         weights=weights_data,
-        bins=[b.values for b in aligned_bins],
+        bins=[b.values for b in bin_coords],
         range=range,
         axis=axis,
         density=density,
         block_size=block_size,
     )
 
-    # create new coords
-    #bin_centers = [0.5 * (bin[:-1] + bin[1:]) for bin in bins]
-    #new_coords = {
-    #    name: ((name,), bin_center, a.attrs)
-    #    for name, bin_center, a in zip(new_dims, bin_centers, args)
-    #}
     # Adjust bin coords to return positions of bin centres rather than bin edges
     def _find_centers(da, dim):
         return 0.5 * (da.isel(**{dim: slice(None, -1, None)})
                       + da.isel(**{dim: slice(1, None, None)}))
 
     bin_centers = [
-        _find_centers(bin, new_bin_dim) for bin, new_bin_dim in zip(aligned_bins, new_dims)
+        _find_centers(bin, new_bin_dim) for bin, new_bin_dim in zip(bin_coords, new_dims)
     ]
 
-
     # old coords associated with dims
     old_dim_coords = {name: a0[name] for name in dims_to_keep if name in a_coords}
 

From 3efedd14e2a96c92e15864a99565e139f686e525 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <tegn500@york.ac.uk>
Date: Mon, 7 Jun 2021 13:28:12 -0400
Subject: [PATCH 17/21] passes weights tests again

---
 xhistogram/core.py | 46 +++++++++++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index 9c36bcd..21e7d78 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -14,6 +14,7 @@
     ravel_multi_index,
     concatenate,
     broadcast_arrays,
+    broadcast_to,
 )
 
 # range is a keyword so save the builtin so they can use it.
@@ -152,6 +153,11 @@ def _bincount_2d_vectorized(
 
     nrows, ncols = a0.shape
 
+    #bins = [np.expand_dims(b, axis=0) if b.ndim == 1 else b for b in bins]
+
+    # TODO assuming all bins have same form here
+    b0 = bins[0]
+
     # a marginally faster implementation would be to use searchsorted,
     # like numpy histogram itself does
     # https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L864-L882
@@ -162,7 +168,12 @@ def _bincount_2d_vectorized(
     # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/calibration.py#L592
     # but a better approach would be to use something like _search_sorted_inclusive() in
     # numpy histogram. This is an additional motivation for moving to searchsorted
-    #bins = [np.concatenate((b[:-1], b[-1:] + 1e-8)) for b in bins]
+    # TODO wouldn't need ifs if we just promoted all bins to 2D
+
+    if b0.ndim == 1:
+        bins = [np.concatenate((b[:-1], b[-1:] + 1e-8)) for b in bins]
+    elif b0.ndim == 2:
+        bins = [np.concatenate((b[:, :-1], b[:, -1:] + 1e-8), axis=1) for b in bins]
 
     # the maximum possible value of of digitize is nbins
     # for right=False:
@@ -170,14 +181,12 @@ def _bincount_2d_vectorized(
     #   - i corresponds to bins[i-1] <= a < b[i]
     #   - nbins corresponds to a a >= b[1]
 
-    # TODO assuming all bins have same form here
-    b = bins[0]
-    if b.ndim == 1:
+    if b0.ndim == 1:
         nbins = [len(b) for b in bins]
         hist_shapes = [nb + 1 for nb in nbins]
 
         each_bin_indices = [digitize(a, b) for a, b in zip(args, bins)]
-    elif b.ndim == 2:
+    elif b0.ndim == 2:
         nbins = [b.shape[1] for b in bins]
         hist_shapes = [nb + 1 for nb in nbins]
 
@@ -211,16 +220,18 @@ def _bincount_2d_vectorized(
     return bin_counts
 
 
-def _bincount(*all_arrays, weights=False, axis=None, density=None):
+def _bincount(*all_arrays, weights=False, axis=None, density=False):
+
+    all_arrays = list(all_arrays)
+
+    weights_array = all_arrays.pop()
 
     # TODO a more robust way to pass the bins together with the arrays
     n_args = len(all_arrays) // 2
     arrays = all_arrays[:n_args]
-    print(arrays[0].shape)
     bins = all_arrays[n_args:]
-    print(bins[0].shape)
 
-    # is this necessary?
+    # is this necessary? (it is necessary for the weights to match the data)
     all_arrays_broadcast = broadcast_arrays(*arrays)
 
     a0 = all_arrays_broadcast[0]
@@ -249,19 +260,24 @@ def reshape_input(a):
             dims_1 = c.shape[split_idx:]
             new_dim_0 = np.prod(dims_0)
             new_dim_1 = np.prod(dims_1)
+            # TODO integer vs float logic here is not robust
             d = reshape(c, (new_dim_0, new_dim_1))
         return d
 
     all_arrays_reshaped = [reshape_input(a) for a in all_arrays_broadcast]
-    bins_reshaped = [reshape_input(b) for b in bins]
 
+    if any(b.ndim > 1 for b in bins):
+        bins_reshaped = [reshape_input(b) for b in bins]
+    else:
+        bins_reshaped = bins
     if weights:
-        weights_array = all_arrays_reshaped.pop()
+        weights_broadcast = broadcast_to(weights_array, a0.shape)
+        weights_reshaped = reshape_input(weights_broadcast)
     else:
-        weights_array = None
+        weights_reshaped = None
 
     bin_counts = _bincount_2d_vectorized(
-        *all_arrays_reshaped, bins=bins_reshaped, weights=weights_array
+        *all_arrays_reshaped, bins=bins_reshaped, weights=weights_reshaped
     )
 
     final_shape = kept_axes_shape + bin_counts.shape[1:]
@@ -377,8 +393,8 @@ def histogram(
     all_arrays = list(args)
     n_inputs = len(all_arrays)
 
+    # TODO make feeding weights in less janky
     if weights is not None:
-        all_arrays.append(weights)
         has_weights = True
     else:
         has_weights = False
@@ -466,7 +482,7 @@ def histogram(
         bin_counts = bin_counts.sum(drop_axes)
     else:
         # drop the extra axis used for summing over blocks
-        bin_counts = _bincount(*(all_arrays + bins), **bincount_kwargs).squeeze(drop_axes)
+        bin_counts = _bincount(*(all_arrays + bins + [weights]), **bincount_kwargs).squeeze(drop_axes)
 
     if density:
         # Normalise by dividing by bin counts and areas such that all the

From 14f38a9b3d5bf2dd71aa0f95464a5eb5ca6afec0 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <tegn500@york.ac.uk>
Date: Mon, 7 Jun 2021 14:23:37 -0400
Subject: [PATCH 18/21] removed debugging stuff

---
 xhistogram/test/test_xarray.py | 5 -----
 xhistogram/xarray.py           | 6 ------
 2 files changed, 11 deletions(-)

diff --git a/xhistogram/test/test_xarray.py b/xhistogram/test/test_xarray.py
index f022171..ee18b4a 100644
--- a/xhistogram/test/test_xarray.py
+++ b/xhistogram/test/test_xarray.py
@@ -254,11 +254,6 @@ def _np_hist(*args, **kwargs):
             h, _ = np.histogram(*args, **kwargs)
             return h
 
-        #hist = np.apply_along_axis(
-        #    _np_hist, axis=0, arr=data_a.values,
-        #    bins=[bins_a]
-        #)
-
 
         hist = np.stack([_np_hist(data_a.values[0, :], bins=bins_a[0, :]),
                          _np_hist(data_a.values[1, :], bins=bins_a[1, :])])
diff --git a/xhistogram/xarray.py b/xhistogram/xarray.py
index 9862706..371cd27 100644
--- a/xhistogram/xarray.py
+++ b/xhistogram/xarray.py
@@ -218,7 +218,6 @@ def _find_centers(da, dim):
 
     all_coords = {}
     all_coords.update(old_dim_coords)
-    print(bin_centers)
     all_coords.update({b.name: b for b in bin_centers})
     # add compatible coords
     if keep_coords:
@@ -228,11 +227,6 @@ def _find_centers(da, dim):
 
     output_name = "_".join(["histogram"] + [a.name for a in args[:N_args]])
 
-    print(h_data)
-    print(h_data.shape)
-    print(output_dims)
-    print(all_coords)
-    print(output_name)
     da_out = xr.DataArray(h_data, dims=output_dims, coords=all_coords, name=output_name)
 
     if density:

From f79b5e05ad1acf1d799330fe56d37c4ee39bf39e Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <tegn500@york.ac.uk>
Date: Mon, 7 Jun 2021 15:22:06 -0400
Subject: [PATCH 19/21] check for superfluous dimensions on bins

---
 xhistogram/test/test_xarray.py | 2 +-
 xhistogram/xarray.py           | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/xhistogram/test/test_xarray.py b/xhistogram/test/test_xarray.py
index ee18b4a..dace559 100644
--- a/xhistogram/test/test_xarray.py
+++ b/xhistogram/test/test_xarray.py
@@ -213,7 +213,7 @@ class TestMultiDimensionalBins:
     def test_bin_dataarrays_with_extra_dims(self):
         data = xr.DataArray([0], dims=["x"], name="a")
         bins = xr.DataArray([[1]], dims=["bad", "a_bin"])
-        with pytest.raises(ValueError, match="will not be broadcast"):
+        with pytest.raises(ValueError, match="not present in data"):
             histogram(data, dim="x", bins=[bins])
 
     def test_bin_dataarrays_without_reduce_dim(self):
diff --git a/xhistogram/xarray.py b/xhistogram/xarray.py
index 371cd27..1a28e85 100644
--- a/xhistogram/xarray.py
+++ b/xhistogram/xarray.py
@@ -183,10 +183,12 @@ def histogram(
                 output_shape = a0.isel(**{d: 0 for d in dim}, drop=True)
             aligned_bin_coord, _ = xr.align(bin, output_shape, join="exact")
 
-            # TODO check correct dimensions exist
+            # check correct dimensions exist
             if new_dim not in aligned_bin_coord.dims:
                 raise ValueError(
                     f"bins DataArray does not contain dimension {new_dim}")
+            if any(d not in output_dims for d in aligned_bin_coord.dims):
+                raise ValueError("dimensions present in bins not present in data")
 
             # Need to align so that the var_bins dim is last, similar to the reduce dims on the data
             bin_coord = aligned_bin_coord.transpose(new_dim, ...)

From 583a85898541e229b729ce277f875f165cb48710 Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <tegn500@york.ac.uk>
Date: Tue, 8 Jun 2021 13:36:53 -0400
Subject: [PATCH 20/21] passes dask tests but actually broken in general

---
 xhistogram/core.py           | 7 +++++--
 xhistogram/test/test_core.py | 1 +
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index 21e7d78..3d6411d 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -415,14 +415,16 @@ def histogram(
     # to write a version of this that doesn't trigger when `range` is provided, but
     # for now let's just use np.histogram_bin_edges
     if is_dask_array:
-        if not all([isinstance(b, np.ndarray) for b in bins]):
+        if not all([isinstance(b, np.ndarray) for b in formatted_bins]):
             raise TypeError(
                 "When using dask arrays, bins must be provided as numpy array(s) of edges"
             )
+        bins = formatted_bins
     else:
         bins = []
         for a, b, r in zip(all_arrays, formatted_bins, range):
             if isinstance(b, np.ndarray):
+                # account for possibility that bins is a >1d numpy array
                 pass
             else:
                 b = np.histogram_bin_edges(a, b, r)
@@ -464,7 +466,8 @@ def histogram(
             blockwise_args.append(input_index)
 
         # Bins arrays do not contain axes which will get reduced along
-        bins_input_index = tuple(ii for ii in out_index if ii not in drop_axes)
+        # TODO incorrect for >1D bins - how do we know what broadcast axes are on bins here?
+        bins_input_index = tuple(new_axes.keys())
         for b in bins:
             blockwise_args.append(b)
             blockwise_args.append(bins_input_index)
diff --git a/xhistogram/test/test_core.py b/xhistogram/test/test_core.py
index e0df0ee..eb58d97 100644
--- a/xhistogram/test/test_core.py
+++ b/xhistogram/test/test_core.py
@@ -182,6 +182,7 @@ def test_histogram_results_3d_density():
     np.testing.assert_allclose(integral, 1.0)
 
 
+# TODO parametrize this over axes so there is only one assert per test
 @pytest.mark.parametrize("block_size", [None, 5, "auto"])
 @pytest.mark.parametrize("use_dask", [False, True])
 def test_histogram_shape(use_dask, block_size):

From a9ec2b7d74bac49d541429d274d95de0caf29b9c Mon Sep 17 00:00:00 2001
From: Thomas Nicholas <tegn500@york.ac.uk>
Date: Tue, 8 Jun 2021 15:45:27 -0400
Subject: [PATCH 21/21] WIP attempt to internally broadcast bins

---
 xhistogram/core.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index 3d6411d..8815ab4 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -431,6 +431,12 @@ def histogram(
             bins.append(b)
     bincount_kwargs = dict(weights=has_weights, axis=axis, density=density)
 
+    # broadcast bins to match reduced data
+    # TODO assumes all bins arguments have the same shape
+    b0 = bins[0]
+    broadcast_bins_shape = [ii for ii in a0.shape if ii not in axis] + list(b0.shape)
+    bins = [np.broadcast_to(b, broadcast_bins_shape) for b in bins]
+
     # keep these axes in the inputs
     if axis is not None:
         drop_axes = tuple([ii for ii in input_index if ii in axis])