Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 7dcd29a

Browse files
ftrojan and BvB93
authored Feb 12, 2021
BUG: Fix unique handling of nan entries. (numpy#18070)
* benchmark bench_lib.Unique added * extended test_unique_1d * modify _unique1d * extend test with return_index, return_inverse and return_counts parameters * documentation updated * Update numpy/lib/arraysetops.py Co-authored-by: Bas van Beek <43369155+BvB93@users.noreply.github.com> * full coverage of nan types Co-authored-by: Bas van Beek <43369155+BvB93@users.noreply.github.com> * added tests for the datetime like dtypes * nan as vector of length 1 * use aux[-1] as nan, ..versionchanged, release note * for complex arrays all NaN values are considered equivalent Co-authored-by: filip_trojan <Tarantula2018> Co-authored-by: Bas van Beek <43369155+BvB93@users.noreply.github.com>
1 parent a5dc2b5 commit 7dcd29a

File tree

4 files changed

+102
-1
lines changed

4 files changed

+102
-1
lines changed
 

Diff for: ‎benchmarks/benchmarks/bench_lib.py

+24
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ def setup(self, shape, pad_width, mode):
5353
def time_pad(self, shape, pad_width, mode):
5454
np.pad(self.array, pad_width, mode)
5555

56+
5657
class Nan(Benchmark):
5758
"""Benchmarks for nan functions"""
5859

@@ -113,3 +114,26 @@ def time_nanquantile(self, array_size, percent_nans):
113114

114115
def time_nanpercentile(self, array_size, percent_nans):
115116
np.nanpercentile(self.arr, q=50)
117+
118+
119+
class Unique(Benchmark):
    """Time ``np.unique`` on 1-D float arrays that contain ``np.nan``."""

    param_names = ["array_size", "percent_nans"]
    params = [
        # length of the 1-D input array
        [200, int(2e5)],
        # share of entries replaced by np.nan, in percent
        [0, 0.1, 2., 50., 90.],
    ]

    def setup(self, array_size, percent_nans):
        """Build the input array for one (array_size, percent_nans) pair."""
        # Fixed seed so that every run is timed on the same data.
        np.random.seed(123)
        # Draw uniform samples, then overwrite everything below the
        # requested fraction with NaN; the resulting NaN share is only
        # approximately ``percent_nans`` and the NaNs are randomly placed.
        values = np.random.uniform(size=array_size)
        nan_fraction = percent_nans / 100.
        values[values < nan_fraction] = np.nan
        self.arr = values

    def time_unique(self, array_size, percent_nans):
        """Timed body: a plain ``np.unique`` call on the prepared array."""
        np.unique(self.arr)

Diff for: ‎doc/release/upcoming_changes/18070.improvement.rst

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
``np.unique`` now returns single ``NaN``
2+
----------------------------------------
3+
When ``np.unique`` operated on an array with multiple ``NaN`` entries,
4+
its return included a ``NaN`` for each entry that was ``NaN`` in the original array.
5+
This is now improved such that the returned array contains just one ``NaN`` as the
6+
last element.
7+
8+
Also for complex arrays all ``NaN`` values are considered equivalent
9+
(no matter whether the ``NaN`` is in the real or imaginary part). As the
10+
representative for the returned array the smallest one in the
11+
lexicographical order is chosen - see ``np.sort`` for how the lexicographical
12+
order is defined for complex arrays.

Diff for: ‎numpy/lib/arraysetops.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,16 @@ def unique(ar, return_index=False, return_inverse=False,
209209
flattened subarrays are sorted in lexicographic order starting with the
210210
first element.
211211
212+
.. versionchanged:: 1.21
213+
If nan values are in the input array, a single nan is put
214+
to the end of the sorted unique values.
215+
216+
Also for complex arrays all NaN values are considered equivalent
217+
(no matter whether the NaN is in the real or imaginary part).
218+
As the representative for the returned array the smallest one in the
219+
lexicographical order is chosen - see np.sort for how the lexicographical
220+
order is defined for complex arrays.
221+
212222
Examples
213223
--------
214224
>>> np.unique([1, 1, 2, 2, 3, 3])
@@ -324,7 +334,16 @@ def _unique1d(ar, return_index=False, return_inverse=False,
324334
aux = ar
325335
mask = np.empty(aux.shape, dtype=np.bool_)
326336
mask[:1] = True
327-
mask[1:] = aux[1:] != aux[:-1]
337+
if aux.shape[0] > 0 and aux.dtype.kind in "cfmM" and np.isnan(aux[-1]):
338+
if aux.dtype.kind == "c": # for complex all NaNs are considered equivalent
339+
aux_firstnan = np.searchsorted(np.isnan(aux), True, side='left')
340+
else:
341+
aux_firstnan = np.searchsorted(aux, aux[-1], side='left')
342+
mask[1:aux_firstnan] = (aux[1:aux_firstnan] != aux[:aux_firstnan - 1])
343+
mask[aux_firstnan] = True
344+
mask[aux_firstnan + 1:] = False
345+
else:
346+
mask[1:] = aux[1:] != aux[:-1]
328347

329348
ret = (aux[mask],)
330349
if return_index:

Diff for: ‎numpy/lib/tests/test_arraysetops.py

+46
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,52 @@ def check_all(a, b, i1, i2, c, dt):
564564
assert_equal(a3_idx.dtype, np.intp)
565565
assert_equal(a3_inv.dtype, np.intp)
566566

567+
# test for ticket 2111 - float
568+
a = [2.0, np.nan, 1.0, np.nan]
569+
ua = [1.0, 2.0, np.nan]
570+
ua_idx = [2, 0, 1]
571+
ua_inv = [1, 2, 0, 2]
572+
ua_cnt = [1, 1, 2]
573+
assert_equal(np.unique(a), ua)
574+
assert_equal(np.unique(a, return_index=True), (ua, ua_idx))
575+
assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv))
576+
assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt))
577+
578+
# test for ticket 2111 - complex
579+
a = [2.0-1j, np.nan, 1.0+1j, complex(0.0, np.nan), complex(1.0, np.nan)]
580+
ua = [1.0+1j, 2.0-1j, complex(0.0, np.nan)]
581+
ua_idx = [2, 0, 3]
582+
ua_inv = [1, 2, 0, 2, 2]
583+
ua_cnt = [1, 1, 3]
584+
assert_equal(np.unique(a), ua)
585+
assert_equal(np.unique(a, return_index=True), (ua, ua_idx))
586+
assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv))
587+
assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt))
588+
589+
# test for ticket 2111 - datetime64
590+
nat = np.datetime64('nat')
591+
a = [np.datetime64('2020-12-26'), nat, np.datetime64('2020-12-24'), nat]
592+
ua = [np.datetime64('2020-12-24'), np.datetime64('2020-12-26'), nat]
593+
ua_idx = [2, 0, 1]
594+
ua_inv = [1, 2, 0, 2]
595+
ua_cnt = [1, 1, 2]
596+
assert_equal(np.unique(a), ua)
597+
assert_equal(np.unique(a, return_index=True), (ua, ua_idx))
598+
assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv))
599+
assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt))
600+
601+
# test for ticket 2111 - timedelta
602+
nat = np.timedelta64('nat')
603+
a = [np.timedelta64(1, 'D'), nat, np.timedelta64(1, 'h'), nat]
604+
ua = [np.timedelta64(1, 'h'), np.timedelta64(1, 'D'), nat]
605+
ua_idx = [2, 0, 1]
606+
ua_inv = [1, 2, 0, 2]
607+
ua_cnt = [1, 1, 2]
608+
assert_equal(np.unique(a), ua)
609+
assert_equal(np.unique(a, return_index=True), (ua, ua_idx))
610+
assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv))
611+
assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt))
612+
567613
def test_unique_axis_errors(self):
568614
assert_raises(TypeError, self._run_axis_tests, object)
569615
assert_raises(TypeError, self._run_axis_tests,

0 commit comments

Comments
 (0)
Please sign in to comment.