BUG: Fix unique handling of nan entries. (numpy#18070)

ftrojan · BvB93 · web-flow · commit 7dcd29aaafe1 · 2021-02-12T09:47:55.000-07:00
* benchmark bench_lib.Unique added

* extended test_unique_1d

* modify _unique1d

* extend test with return_index, return_inverse and return_counts parameters

* documentation updated

* Update numpy/lib/arraysetops.py

Co-authored-by: Bas van Beek &lt;43369155+BvB93@users.noreply.github.com&gt;

* full coverage of nan types

Co-authored-by: Bas van Beek &lt;43369155+BvB93@users.noreply.github.com&gt;

* added tests for the datetime like dtypes

* nan as vector of length 1

* use aux[-1] as nan, ..versionchanged, release note

* for complex arrays all NaN values are considered equivalent

Co-authored-by: filip_trojan &lt;Tarantula2018&gt;
Co-authored-by: Bas van Beek &lt;43369155+BvB93@users.noreply.github.com&gt;
diff --git a/benchmarks/benchmarks/bench_lib.py b/benchmarks/benchmarks/bench_lib.py
@@ -53,6 +53,7 @@ def setup(self, shape, pad_width, mode):
     def time_pad(self, shape, pad_width, mode):
         np.pad(self.array, pad_width, mode)
 
+
 class Nan(Benchmark):
     """Benchmarks for nan functions"""
 
@@ -113,3 +114,26 @@ def time_nanquantile(self, array_size, percent_nans):
 
     def time_nanpercentile(self, array_size, percent_nans):
         np.nanpercentile(self.arr, q=50)
+
+
+class Unique(Benchmark):
+    """Benchmark for np.unique with np.nan values."""
+
+    param_names = ["array_size", "percent_nans"]
+    params = [
+        # sizes of the 1D arrays
+        [200, int(2e5)],
+        # percent of np.nan in arrays
+        [0, 0.1, 2., 50., 90.],
+    ]
+
+    def setup(self, array_size, percent_nans):
+        np.random.seed(123)
+        # produce a randomly shuffled array with the
+        # approximate desired percentage np.nan content
+        base_array = np.random.uniform(size=array_size)
+        base_array[base_array < percent_nans / 100.] = np.nan
+        self.arr = base_array
+
+    def time_unique(self, array_size, percent_nans):
+        np.unique(self.arr)
diff --git a/doc/release/upcoming_changes/18070.improvement.rst b/doc/release/upcoming_changes/18070.improvement.rst
@@ -0,0 +1,12 @@
+``np.unique`` now returns single ``NaN``
+----------------------------------------
+When ``np.unique`` operated on an array with multiple ``NaN`` entries,
+its return included a ``NaN`` for each entry that was ``NaN`` in the original array.
+This is now improved such that the returned array contains just one ``NaN`` as the
+last element.
+
+Also for complex arrays all ``NaN`` values are considered equivalent
+(no matter whether the ``NaN`` is in the real or imaginary part). As the
+representant for the returned array the smallest one in the
+lexicographical order is chosen - see ``np.sort`` for how the lexicographical
+order is defined for complex arrays.
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
@@ -209,6 +209,16 @@ def unique(ar, return_index=False, return_inverse=False,
     flattened subarrays are sorted in lexicographic order starting with the
     first element.
 
+    .. versionchanged: NumPy 1.21
+        If nan values are in the input array, a single nan is put
+        to the end of the sorted unique values.
+
+        Also for complex arrays all NaN values are considered equivalent
+        (no matter whether the NaN is in the real or imaginary part).
+        As the representant for the returned array the smallest one in the
+        lexicographical order is chosen - see np.sort for how the lexicographical
+        order is defined for complex arrays.
+
     Examples
     --------
     >>> np.unique([1, 1, 2, 2, 3, 3])
@@ -324,7 +334,16 @@ def _unique1d(ar, return_index=False, return_inverse=False,
         aux = ar
     mask = np.empty(aux.shape, dtype=np.bool_)
     mask[:1] = True
-    mask[1:] = aux[1:] != aux[:-1]
+    if aux.shape[0] > 0 and aux.dtype.kind in "cfmM" and np.isnan(aux[-1]):
+        if aux.dtype.kind == "c":  # for complex all NaNs are considered equivalent
+            aux_firstnan = np.searchsorted(np.isnan(aux), True, side='left')
+        else:
+            aux_firstnan = np.searchsorted(aux, aux[-1], side='left')
+        mask[1:aux_firstnan] = (aux[1:aux_firstnan] != aux[:aux_firstnan - 1])
+        mask[aux_firstnan] = True
+        mask[aux_firstnan + 1:] = False
+    else:
+        mask[1:] = aux[1:] != aux[:-1]
 
     ret = (aux[mask],)
     if return_index:
diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py
@@ -564,6 +564,52 @@ def check_all(a, b, i1, i2, c, dt):
         assert_equal(a3_idx.dtype, np.intp)
         assert_equal(a3_inv.dtype, np.intp)
 
+        # test for ticket 2111 - float
+        a = [2.0, np.nan, 1.0, np.nan]
+        ua = [1.0, 2.0, np.nan]
+        ua_idx = [2, 0, 1]
+        ua_inv = [1, 2, 0, 2]
+        ua_cnt = [1, 1, 2]
+        assert_equal(np.unique(a), ua)
+        assert_equal(np.unique(a, return_index=True), (ua, ua_idx))
+        assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv))
+        assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt))
+
+        # test for ticket 2111 - complex
+        a = [2.0-1j, np.nan, 1.0+1j, complex(0.0, np.nan), complex(1.0, np.nan)]
+        ua = [1.0+1j, 2.0-1j, complex(0.0, np.nan)]
+        ua_idx = [2, 0, 3]
+        ua_inv = [1, 2, 0, 2, 2]
+        ua_cnt = [1, 1, 3]
+        assert_equal(np.unique(a), ua)
+        assert_equal(np.unique(a, return_index=True), (ua, ua_idx))
+        assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv))
+        assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt))
+
+        # test for ticket 2111 - datetime64
+        nat = np.datetime64('nat')
+        a = [np.datetime64('2020-12-26'), nat, np.datetime64('2020-12-24'), nat]
+        ua = [np.datetime64('2020-12-24'), np.datetime64('2020-12-26'), nat]
+        ua_idx = [2, 0, 1]
+        ua_inv = [1, 2, 0, 2]
+        ua_cnt = [1, 1, 2]
+        assert_equal(np.unique(a), ua)
+        assert_equal(np.unique(a, return_index=True), (ua, ua_idx))
+        assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv))
+        assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt))
+
+        # test for ticket 2111 - timedelta
+        nat = np.timedelta64('nat')
+        a = [np.timedelta64(1, 'D'), nat, np.timedelta64(1, 'h'), nat]
+        ua = [np.timedelta64(1, 'h'), np.timedelta64(1, 'D'), nat]
+        ua_idx = [2, 0, 1]
+        ua_inv = [1, 2, 0, 2]
+        ua_cnt = [1, 1, 2]
+        assert_equal(np.unique(a), ua)
+        assert_equal(np.unique(a, return_index=True), (ua, ua_idx))
+        assert_equal(np.unique(a, return_inverse=True), (ua, ua_inv))
+        assert_equal(np.unique(a, return_counts=True), (ua, ua_cnt))
+
     def test_unique_axis_errors(self):
         assert_raises(TypeError, self._run_axis_tests, object)
         assert_raises(TypeError, self._run_axis_tests,