From c877978755b5ef568489bc4294c87b16d7c6c458 Mon Sep 17 00:00:00 2001 From: dougiesquire Date: Sat, 10 Apr 2021 17:29:51 +1000 Subject: [PATCH 1/6] refactor approach for last bin RH inclusive --- xhistogram/core.py | 33 ++++++++++++++++----------------- xhistogram/duck_array_ops.py | 1 + xhistogram/test/test_core.py | 21 +++++++++++++++++++++ 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/xhistogram/core.py b/xhistogram/core.py index 819b7fb..c495a1b 100644 --- a/xhistogram/core.py +++ b/xhistogram/core.py @@ -6,6 +6,7 @@ import numpy as np from functools import reduce from .duck_array_ops import ( + where, digitize, bincount, reshape, @@ -108,24 +109,22 @@ def _histogram_2d_vectorized( nbins = [len(b) for b in bins] hist_shapes = [nb + 1 for nb in nbins] - # a marginally faster implementation would be to use searchsorted, - # like numpy histogram itself does - # https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L864-L882 - # for now we stick with `digitize` because it's easy to understand how it works - - # Add small increment to the last bin edge to make the final bin right-edge inclusive - # Note, this is the approach taken by sklearn, e.g. - # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/calibration.py#L592 - # but a better approach would be to use something like _search_sorted_inclusive() in - # numpy histogram. This is an additional motivation for moving to searchsorted - bins = [np.concatenate((b[:-1], b[-1:] + 1e-8)) for b in bins] - - # the maximum possible value of of digitize is nbins - # for right=False: + # The maximum possible value of of digitize is nbins + # for _digitize_inclusive: # - 0 corresponds to a < b[0] - # - i corresponds to bins[i-1] <= a < b[i] - # - nbins corresponds to a a >= b[1] - each_bin_indices = [digitize(a, b) for a, b in zip(args, bins)] + # - i corresponds to b[i-1] <= a < b[i] + # - nbins-1 corresponds to b[-2] <= a <= b[-1] + # - nbins corresponds to a >= b[-1] + def _digitize_inclusive(a, b): + """Like `digitize`, but where the last bin is also right-edge inclusive.""" + # Similar to implementation in np.histogramdd + # see https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L1056 + bin_indices = digitize(a, b) + on_edge = a == b[-1] + # Need to use `where` here rather the simple indexing for dask compatibility + return where(on_edge, bin_indices - 1, bin_indices) + + each_bin_indices = [_digitize_inclusive(a, b) for a, b in zip(args, bins)] # product of the bins gives the joint distribution if N_inputs > 1: bin_indices = ravel_multi_index(each_bin_indices, hist_shapes) diff --git a/xhistogram/duck_array_ops.py b/xhistogram/duck_array_ops.py index b9e632e..552ddab 100644 --- a/xhistogram/duck_array_ops.py +++ b/xhistogram/duck_array_ops.py @@ -31,6 +31,7 @@ def f(*args, **kwargs): return f +where = _dask_or_eager_func("where") digitize = _dask_or_eager_func("digitize") bincount = _dask_or_eager_func("bincount") reshape = _dask_or_eager_func("reshape") diff --git a/xhistogram/test/test_core.py b/xhistogram/test/test_core.py index eef22dd..f804311 100644 --- a/xhistogram/test/test_core.py +++ b/xhistogram/test/test_core.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd from itertools import combinations import dask.array as dsa @@ -202,3 +203,23 @@ def test_histogram_shape(use_dask, block_size): assert c.shape == expected_shape if use_dask: assert isinstance(c, dsa.Array) + + +@pytest.mark.parametrize("block_size", [None, 1, 2]) +@pytest.mark.parametrize("use_dask", [False, True]) +def test_histogram_results_datetime(use_dask, block_size): + """Test computing histogram of datetime objects""" + data = pd.date_range(start="2000-06-01", periods=5) + if use_dask: + data = dsa.asarray(data, chunks=(5,)) + # everything should be in the second bin (index 1) + bins = np.array( + [ + np.datetime64("1999-01-01"), + np.datetime64("2000-01-01"), + np.datetime64("2001-01-01"), + ] + ) + h = histogram(data, bins=bins, block_size=block_size) + expected = np.histogram(data, bins=bins)[0] + np.testing.assert_allclose(h, expected) From 9cc290deecdbf1b925c904e9e7d3814a863145c1 Mon Sep 17 00:00:00 2001 From: dougiesquire Date: Sun, 2 May 2021 14:03:30 +1000 Subject: [PATCH 2/6] remove test with datetime objects --- xhistogram/test/test_core.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/xhistogram/test/test_core.py b/xhistogram/test/test_core.py index f804311..eef22dd 100644 --- a/xhistogram/test/test_core.py +++ b/xhistogram/test/test_core.py @@ -1,5 +1,4 @@ import numpy as np -import pandas as pd from itertools import combinations import dask.array as dsa @@ -203,23 +202,3 @@ def test_histogram_shape(use_dask, block_size): assert c.shape == expected_shape if use_dask: assert isinstance(c, dsa.Array) - - -@pytest.mark.parametrize("block_size", [None, 1, 2]) -@pytest.mark.parametrize("use_dask", [False, True]) -def test_histogram_results_datetime(use_dask, block_size): - """Test computing histogram of datetime objects""" - data = pd.date_range(start="2000-06-01", periods=5) - if use_dask: - data = dsa.asarray(data, chunks=(5,)) - # everything should be in the second bin (index 1) - bins = np.array( - [ - np.datetime64("1999-01-01"), - np.datetime64("2000-01-01"), - np.datetime64("2001-01-01"), - ] - ) - h = histogram(data, bins=bins, block_size=block_size) - expected = np.histogram(data, bins=bins)[0] - np.testing.assert_allclose(h, expected) From 3b9ca32f64ffe58d79c391aa72fff52723cd002e Mon Sep 17 00:00:00 2001 From: dougiesquire Date: Tue, 15 Jun 2021 11:58:18 +1000 Subject: [PATCH 3/6] pre-commit --- xhistogram/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xhistogram/core.py b/xhistogram/core.py index 56a9458..29f1589 100644 --- a/xhistogram/core.py +++ b/xhistogram/core.py @@ -166,8 +166,8 @@ def _digitize_inclusive(a, b): # see https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L1056 bin_indices = digitize(a, b) on_edge = a == b[-1] - # Need to use `where` here rather the simple indexing for dask compatibility - return bin_indices[on_edge] -= 1 + bin_indices[on_edge] -= 1 + return bin_indices each_bin_indices = [_digitize_inclusive(a, b) for a, b in zip(args, bins)] # product of the bins gives the joint distribution From b4ecbd32b717af04c9cd8828c61368a9d6858ef7 Mon Sep 17 00:00:00 2001 From: dougiesquire Date: Tue, 15 Jun 2021 13:19:53 +1000 Subject: [PATCH 4/6] add back test with datetime objects --- xhistogram/core.py | 4 ++-- xhistogram/test/test_core.py | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/xhistogram/core.py b/xhistogram/core.py index 29f1589..edf5765 100644 --- a/xhistogram/core.py +++ b/xhistogram/core.py @@ -154,8 +154,8 @@ def _bincount_2d_vectorized( nbins = [len(b) for b in bins] hist_shapes = [nb + 1 for nb in nbins] - # The maximum possible value of of digitize is nbins - # for _digitize_inclusive: + # The maximum possible value of digitize is nbins + # For _digitize_inclusive: # - 0 corresponds to a < b[0] # - i corresponds to b[i-1] <= a < b[i] # - nbins-1 corresponds to b[-2] <= a <= b[-1] diff --git a/xhistogram/test/test_core.py b/xhistogram/test/test_core.py index f6ebcc3..d615b68 100644 --- a/xhistogram/test/test_core.py +++ b/xhistogram/test/test_core.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd from itertools import combinations import dask.array as dsa @@ -313,3 +314,23 @@ def test_ensure_correctly_formatted_range(in_out): else: with pytest.raises(ValueError): _ensure_correctly_formatted_range(range_in, n) + + +@pytest.mark.parametrize("block_size", [None, 1, 2]) +@pytest.mark.parametrize("use_dask", [False, True]) +def test_histogram_results_datetime(use_dask, block_size): + """Test computing histogram of datetime objects""" + data = pd.date_range(start="2000-06-01", periods=5) + if use_dask: + data = dsa.asarray(data, chunks=(5,)) + # everything should be in the second bin (index 1) + bins = np.array( + [ + np.datetime64("1999-01-01"), + np.datetime64("2000-01-01"), + np.datetime64("2001-01-01"), + ] + ) + h = histogram(data, bins=bins, block_size=block_size) + expected = np.histogram(data, bins=bins)[0] + np.testing.assert_allclose(h, expected) From 67d32f4a01d0927732d47383c6cc3fc44fa2a0ba Mon Sep 17 00:00:00 2001 From: dougiesquire Date: Wed, 16 Jun 2021 16:39:08 +1000 Subject: [PATCH 5/6] switch to searchsorted --- xhistogram/core.py | 19 +++++++++++-------- xhistogram/test/test_core.py | 2 +- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/xhistogram/core.py b/xhistogram/core.py index edf5765..66279f3 100644 --- a/xhistogram/core.py +++ b/xhistogram/core.py @@ -8,7 +8,7 @@ from functools import reduce from collections.abc import Iterable from numpy import ( - digitize, + searchsorted, bincount, reshape, ravel_multi_index, @@ -154,22 +154,25 @@ def _bincount_2d_vectorized( nbins = [len(b) for b in bins] hist_shapes = [nb + 1 for nb in nbins] - # The maximum possible value of digitize is nbins - # For _digitize_inclusive: + # The maximum possible value of searchsorted is nbins + # For _searchsorted_inclusive: # - 0 corresponds to a < b[0] # - i corresponds to b[i-1] <= a < b[i] # - nbins-1 corresponds to b[-2] <= a <= b[-1] # - nbins corresponds to a >= b[-1] - def _digitize_inclusive(a, b): - """Like `digitize`, but where the last bin is also right-edge inclusive.""" + def _searchsorted_inclusive(a, b): + """ + Like `searchsorted`, but where the last bin is also right-edge inclusive. + """ # Similar to implementation in np.histogramdd # see https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L1056 - bin_indices = digitize(a, b) + bin_indices = searchsorted(b, a, side="right") on_edge = a == b[-1] + # Shift these points one bin to the left. bin_indices[on_edge] -= 1 return bin_indices - each_bin_indices = [_digitize_inclusive(a, b) for a, b in zip(args, bins)] + each_bin_indices = [_searchsorted_inclusive(a, b) for a, b in zip(args, bins)] # product of the bins gives the joint distribution if N_inputs > 1: bin_indices = ravel_multi_index(each_bin_indices, hist_shapes) @@ -319,7 +322,7 @@ def histogram( See Also -------- - numpy.histogram, numpy.bincount, numpy.digitize + numpy.histogram, numpy.bincount, numpy.searchsorted """ a0 = args[0] diff --git a/xhistogram/test/test_core.py b/xhistogram/test/test_core.py index d615b68..7851c0f 100644 --- a/xhistogram/test/test_core.py +++ b/xhistogram/test/test_core.py @@ -331,6 +331,6 @@ def test_histogram_results_datetime(use_dask, block_size): np.datetime64("2001-01-01"), ] ) - h = histogram(data, bins=bins, block_size=block_size) + h = histogram(data, bins=bins, block_size=block_size)[0] expected = np.histogram(data, bins=bins)[0] np.testing.assert_allclose(h, expected) From 4fdd716ab4e090108a6b99d62fc1c4237834a921 Mon Sep 17 00:00:00 2001 From: dougiesquire Date: Tue, 22 Jun 2021 10:10:37 +1000 Subject: [PATCH 6/6] add note about sorted bins --- xhistogram/core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xhistogram/core.py b/xhistogram/core.py index 66279f3..0f66c81 100644 --- a/xhistogram/core.py +++ b/xhistogram/core.py @@ -166,6 +166,7 @@ def _searchsorted_inclusive(a, b): """ # Similar to implementation in np.histogramdd # see https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L1056 + # This assumes the bins (b) are sorted bin_indices = searchsorted(b, a, side="right") on_edge = a == b[-1] # Shift these points one bin to the left.