From c877978755b5ef568489bc4294c87b16d7c6c458 Mon Sep 17 00:00:00 2001
From: dougiesquire <dougiesquire@gmail.com>
Date: Sat, 10 Apr 2021 17:29:51 +1000
Subject: [PATCH 1/6] refactor approach for last bin RH inclusive

---
 xhistogram/core.py           | 33 ++++++++++++++++-----------------
 xhistogram/duck_array_ops.py |  1 +
 xhistogram/test/test_core.py | 21 +++++++++++++++++++++
 3 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index 819b7fb..c495a1b 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -6,6 +6,7 @@
 import numpy as np
 from functools import reduce
 from .duck_array_ops import (
+    where,
     digitize,
     bincount,
     reshape,
@@ -108,24 +109,22 @@ def _histogram_2d_vectorized(
     nbins = [len(b) for b in bins]
     hist_shapes = [nb + 1 for nb in nbins]
 
-    # a marginally faster implementation would be to use searchsorted,
-    # like numpy histogram itself does
-    # https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L864-L882
-    # for now we stick with `digitize` because it's easy to understand how it works
-
-    # Add small increment to the last bin edge to make the final bin right-edge inclusive
-    # Note, this is the approach taken by sklearn, e.g.
-    # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/calibration.py#L592
-    # but a better approach would be to use something like _search_sorted_inclusive() in
-    # numpy histogram. This is an additional motivation for moving to searchsorted
-    bins = [np.concatenate((b[:-1], b[-1:] + 1e-8)) for b in bins]
-
-    # the maximum possible value of of digitize is nbins
-    # for right=False:
+    # The maximum possible value of of digitize is nbins
+    # for _digitize_inclusive:
     #   - 0 corresponds to a < b[0]
-    #   - i corresponds to bins[i-1] <= a < b[i]
-    #   - nbins corresponds to a a >= b[1]
-    each_bin_indices = [digitize(a, b) for a, b in zip(args, bins)]
+    #   - i corresponds to b[i-1] <= a < b[i]
+    #   - nbins-1 corresponds to b[-2] <= a <= b[-1]
+    #   - nbins corresponds to a >= b[-1]
+    def _digitize_inclusive(a, b):
+        """Like `digitize`, but where the last bin is also right-edge inclusive."""
+        # Similar to implementation in np.histogramdd
+        # see https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L1056
+        bin_indices = digitize(a, b)
+        on_edge = a == b[-1]
+        # Need to use `where` here rather the simple indexing for dask compatibility
+        return where(on_edge, bin_indices - 1, bin_indices)
+
+    each_bin_indices = [_digitize_inclusive(a, b) for a, b in zip(args, bins)]
     # product of the bins gives the joint distribution
     if N_inputs > 1:
         bin_indices = ravel_multi_index(each_bin_indices, hist_shapes)
diff --git a/xhistogram/duck_array_ops.py b/xhistogram/duck_array_ops.py
index b9e632e..552ddab 100644
--- a/xhistogram/duck_array_ops.py
+++ b/xhistogram/duck_array_ops.py
@@ -31,6 +31,7 @@ def f(*args, **kwargs):
     return f
 
 
+where = _dask_or_eager_func("where")
 digitize = _dask_or_eager_func("digitize")
 bincount = _dask_or_eager_func("bincount")
 reshape = _dask_or_eager_func("reshape")
diff --git a/xhistogram/test/test_core.py b/xhistogram/test/test_core.py
index eef22dd..f804311 100644
--- a/xhistogram/test/test_core.py
+++ b/xhistogram/test/test_core.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 
 from itertools import combinations
 import dask.array as dsa
@@ -202,3 +203,23 @@ def test_histogram_shape(use_dask, block_size):
         assert c.shape == expected_shape
         if use_dask:
             assert isinstance(c, dsa.Array)
+
+
+@pytest.mark.parametrize("block_size", [None, 1, 2])
+@pytest.mark.parametrize("use_dask", [False, True])
+def test_histogram_results_datetime(use_dask, block_size):
+    """Test computing histogram of datetime objects"""
+    data = pd.date_range(start="2000-06-01", periods=5)
+    if use_dask:
+        data = dsa.asarray(data, chunks=(5,))
+    # everything should be in the second bin (index 1)
+    bins = np.array(
+        [
+            np.datetime64("1999-01-01"),
+            np.datetime64("2000-01-01"),
+            np.datetime64("2001-01-01"),
+        ]
+    )
+    h = histogram(data, bins=bins, block_size=block_size)
+    expected = np.histogram(data, bins=bins)[0]
+    np.testing.assert_allclose(h, expected)

From 9cc290deecdbf1b925c904e9e7d3814a863145c1 Mon Sep 17 00:00:00 2001
From: dougiesquire <dougiesquire@gmail.com>
Date: Sun, 2 May 2021 14:03:30 +1000
Subject: [PATCH 2/6] remove test with datetime objects

---
 xhistogram/test/test_core.py | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/xhistogram/test/test_core.py b/xhistogram/test/test_core.py
index f804311..eef22dd 100644
--- a/xhistogram/test/test_core.py
+++ b/xhistogram/test/test_core.py
@@ -1,5 +1,4 @@
 import numpy as np
-import pandas as pd
 
 from itertools import combinations
 import dask.array as dsa
@@ -203,23 +202,3 @@ def test_histogram_shape(use_dask, block_size):
         assert c.shape == expected_shape
         if use_dask:
             assert isinstance(c, dsa.Array)
-
-
-@pytest.mark.parametrize("block_size", [None, 1, 2])
-@pytest.mark.parametrize("use_dask", [False, True])
-def test_histogram_results_datetime(use_dask, block_size):
-    """Test computing histogram of datetime objects"""
-    data = pd.date_range(start="2000-06-01", periods=5)
-    if use_dask:
-        data = dsa.asarray(data, chunks=(5,))
-    # everything should be in the second bin (index 1)
-    bins = np.array(
-        [
-            np.datetime64("1999-01-01"),
-            np.datetime64("2000-01-01"),
-            np.datetime64("2001-01-01"),
-        ]
-    )
-    h = histogram(data, bins=bins, block_size=block_size)
-    expected = np.histogram(data, bins=bins)[0]
-    np.testing.assert_allclose(h, expected)

From 3b9ca32f64ffe58d79c391aa72fff52723cd002e Mon Sep 17 00:00:00 2001
From: dougiesquire <dougiesquire@gmail.com>
Date: Tue, 15 Jun 2021 11:58:18 +1000
Subject: [PATCH 3/6] pre-commit

---
 xhistogram/core.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index 56a9458..29f1589 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -166,8 +166,8 @@ def _digitize_inclusive(a, b):
         # see https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L1056
         bin_indices = digitize(a, b)
         on_edge = a == b[-1]
-        # Need to use `where` here rather the simple indexing for dask compatibility
-        return bin_indices[on_edge] -= 1
+        bin_indices[on_edge] -= 1
+        return bin_indices
 
     each_bin_indices = [_digitize_inclusive(a, b) for a, b in zip(args, bins)]
     # product of the bins gives the joint distribution

From b4ecbd32b717af04c9cd8828c61368a9d6858ef7 Mon Sep 17 00:00:00 2001
From: dougiesquire <dougiesquire@gmail.com>
Date: Tue, 15 Jun 2021 13:19:53 +1000
Subject: [PATCH 4/6] add back test with datetime objects

---
 xhistogram/core.py           |  4 ++--
 xhistogram/test/test_core.py | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index 29f1589..edf5765 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -154,8 +154,8 @@ def _bincount_2d_vectorized(
     nbins = [len(b) for b in bins]
     hist_shapes = [nb + 1 for nb in nbins]
 
-    # The maximum possible value of of digitize is nbins
-    # for _digitize_inclusive:
+    # The maximum possible value of digitize is nbins
+    # For _digitize_inclusive:
     #   - 0 corresponds to a < b[0]
     #   - i corresponds to b[i-1] <= a < b[i]
     #   - nbins-1 corresponds to b[-2] <= a <= b[-1]
diff --git a/xhistogram/test/test_core.py b/xhistogram/test/test_core.py
index f6ebcc3..d615b68 100644
--- a/xhistogram/test/test_core.py
+++ b/xhistogram/test/test_core.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 
 from itertools import combinations
 import dask.array as dsa
@@ -313,3 +314,23 @@ def test_ensure_correctly_formatted_range(in_out):
     else:
         with pytest.raises(ValueError):
             _ensure_correctly_formatted_range(range_in, n)
+
+
+@pytest.mark.parametrize("block_size", [None, 1, 2])
+@pytest.mark.parametrize("use_dask", [False, True])
+def test_histogram_results_datetime(use_dask, block_size):
+    """Test computing histogram of datetime objects"""
+    data = pd.date_range(start="2000-06-01", periods=5)
+    if use_dask:
+        data = dsa.asarray(data, chunks=(5,))
+    # everything should be in the second bin (index 1)
+    bins = np.array(
+        [
+            np.datetime64("1999-01-01"),
+            np.datetime64("2000-01-01"),
+            np.datetime64("2001-01-01"),
+        ]
+    )
+    h = histogram(data, bins=bins, block_size=block_size)
+    expected = np.histogram(data, bins=bins)[0]
+    np.testing.assert_allclose(h, expected)

From 67d32f4a01d0927732d47383c6cc3fc44fa2a0ba Mon Sep 17 00:00:00 2001
From: dougiesquire <dougiesquire@gmail.com>
Date: Wed, 16 Jun 2021 16:39:08 +1000
Subject: [PATCH 5/6] switch to searchsorted

---
 xhistogram/core.py           | 19 +++++++++++--------
 xhistogram/test/test_core.py |  2 +-
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index edf5765..66279f3 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -8,7 +8,7 @@
 from functools import reduce
 from collections.abc import Iterable
 from numpy import (
-    digitize,
+    searchsorted,
     bincount,
     reshape,
     ravel_multi_index,
@@ -154,22 +154,25 @@ def _bincount_2d_vectorized(
     nbins = [len(b) for b in bins]
     hist_shapes = [nb + 1 for nb in nbins]
 
-    # The maximum possible value of digitize is nbins
-    # For _digitize_inclusive:
+    # The maximum possible value of searchsorted is nbins
+    # For _searchsorted_inclusive:
     #   - 0 corresponds to a < b[0]
     #   - i corresponds to b[i-1] <= a < b[i]
     #   - nbins-1 corresponds to b[-2] <= a <= b[-1]
     #   - nbins corresponds to a >= b[-1]
-    def _digitize_inclusive(a, b):
-        """Like `digitize`, but where the last bin is also right-edge inclusive."""
+    def _searchsorted_inclusive(a, b):
+        """
+        Like `searchsorted`, but where the last bin is also right-edge inclusive.
+        """
         # Similar to implementation in np.histogramdd
         # see https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L1056
-        bin_indices = digitize(a, b)
+        bin_indices = searchsorted(b, a, side="right")
         on_edge = a == b[-1]
+        # Shift these points one bin to the left.
         bin_indices[on_edge] -= 1
         return bin_indices
 
-    each_bin_indices = [_digitize_inclusive(a, b) for a, b in zip(args, bins)]
+    each_bin_indices = [_searchsorted_inclusive(a, b) for a, b in zip(args, bins)]
     # product of the bins gives the joint distribution
     if N_inputs > 1:
         bin_indices = ravel_multi_index(each_bin_indices, hist_shapes)
@@ -319,7 +322,7 @@ def histogram(
 
     See Also
     --------
-    numpy.histogram, numpy.bincount, numpy.digitize
+    numpy.histogram, numpy.bincount, numpy.searchsorted
     """
 
     a0 = args[0]
diff --git a/xhistogram/test/test_core.py b/xhistogram/test/test_core.py
index d615b68..7851c0f 100644
--- a/xhistogram/test/test_core.py
+++ b/xhistogram/test/test_core.py
@@ -331,6 +331,6 @@ def test_histogram_results_datetime(use_dask, block_size):
             np.datetime64("2001-01-01"),
         ]
     )
-    h = histogram(data, bins=bins, block_size=block_size)
+    h = histogram(data, bins=bins, block_size=block_size)[0]
     expected = np.histogram(data, bins=bins)[0]
     np.testing.assert_allclose(h, expected)

From 4fdd716ab4e090108a6b99d62fc1c4237834a921 Mon Sep 17 00:00:00 2001
From: dougiesquire <dougiesquire@gmail.com>
Date: Tue, 22 Jun 2021 10:10:37 +1000
Subject: [PATCH 6/6] add note about sorted bins

---
 xhistogram/core.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/xhistogram/core.py b/xhistogram/core.py
index 66279f3..0f66c81 100644
--- a/xhistogram/core.py
+++ b/xhistogram/core.py
@@ -166,6 +166,7 @@ def _searchsorted_inclusive(a, b):
         """
         # Similar to implementation in np.histogramdd
         # see https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L1056
+        # This assumes the bins (b) are sorted
         bin_indices = searchsorted(b, a, side="right")
         on_edge = a == b[-1]
         # Shift these points one bin to the left.