diff --git a/xhistogram/core.py b/xhistogram/core.py index c13dda9..0f66c81 100644 --- a/xhistogram/core.py +++ b/xhistogram/core.py @@ -8,7 +8,7 @@ from functools import reduce from collections.abc import Iterable from numpy import ( - digitize, + searchsorted, bincount, reshape, ravel_multi_index, @@ -154,24 +154,26 @@ def _bincount_2d_vectorized( nbins = [len(b) for b in bins] hist_shapes = [nb + 1 for nb in nbins] - # a marginally faster implementation would be to use searchsorted, - # like numpy histogram itself does - # https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L864-L882 - # for now we stick with `digitize` because it's easy to understand how it works - - # Add small increment to the last bin edge to make the final bin right-edge inclusive - # Note, this is the approach taken by sklearn, e.g. - # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/calibration.py#L592 - # but a better approach would be to use something like _search_sorted_inclusive() in - # numpy histogram. This is an additional motivation for moving to searchsorted - bins = [np.concatenate((b[:-1], b[-1:] + 1e-8)) for b in bins] - - # the maximum possible value of of digitize is nbins - # for right=False: + # The maximum possible value of searchsorted is nbins + # For _searchsorted_inclusive: # - 0 corresponds to a < b[0] - # - i corresponds to bins[i-1] <= a < b[i] - # - nbins corresponds to a a >= b[1] - each_bin_indices = [digitize(a, b) for a, b in zip(args, bins)] + # - i corresponds to b[i-1] <= a < b[i] + # - nbins-1 corresponds to b[-2] <= a <= b[-1] + # - nbins corresponds to a >= b[-1] + def _searchsorted_inclusive(a, b): + """ + Like `searchsorted`, but where the last bin is also right-edge inclusive. + """ + # Similar to implementation in np.histogramdd + # see https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L1056 + # This assumes the bins (b) are sorted + bin_indices = searchsorted(b, a, side="right") + on_edge = a == b[-1] + # Shift these points one bin to the left. + bin_indices[on_edge] -= 1 + return bin_indices + + each_bin_indices = [_searchsorted_inclusive(a, b) for a, b in zip(args, bins)] # product of the bins gives the joint distribution if N_inputs > 1: bin_indices = ravel_multi_index(each_bin_indices, hist_shapes) @@ -321,7 +323,7 @@ def histogram( See Also -------- - numpy.histogram, numpy.bincount, numpy.digitize + numpy.histogram, numpy.bincount, numpy.searchsorted """ a0 = args[0] diff --git a/xhistogram/test/test_core.py b/xhistogram/test/test_core.py index f6ebcc3..7851c0f 100644 --- a/xhistogram/test/test_core.py +++ b/xhistogram/test/test_core.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd from itertools import combinations import dask.array as dsa @@ -313,3 +314,23 @@ def test_ensure_correctly_formatted_range(in_out): else: with pytest.raises(ValueError): _ensure_correctly_formatted_range(range_in, n) + + +@pytest.mark.parametrize("block_size", [None, 1, 2]) +@pytest.mark.parametrize("use_dask", [False, True]) +def test_histogram_results_datetime(use_dask, block_size): + """Test computing histogram of datetime objects""" + data = pd.date_range(start="2000-06-01", periods=5) + if use_dask: + data = dsa.asarray(data, chunks=(5,)) + # everything should be in the second bin (index 1) + bins = np.array( + [ + np.datetime64("1999-01-01"), + np.datetime64("2000-01-01"), + np.datetime64("2001-01-01"), + ] + ) + h = histogram(data, bins=bins, block_size=block_size)[0] + expected = np.histogram(data, bins=bins)[0] + np.testing.assert_allclose(h, expected)