From ce0efe86c5a3c261b91ad36a4b550816d7393994 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Sun, 29 Nov 2020 16:59:10 +0100 Subject: [PATCH 01/15] BUG: Series.at returning Series with one element instead of scalar (#38101) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/indexes/multi.py | 4 ++ pandas/tests/indexing/test_scalar.py | 70 +++++++++++++++------------- 3 files changed, 43 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6aff4f4bd41e2..f53cde7fac068 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -631,6 +631,7 @@ Indexing - Bug in :meth:`MultiIndex.drop` does not raise if labels are partially found (:issue:`37820`) - Bug in :meth:`DataFrame.loc` did not raise ``KeyError`` when missing combination was given with ``slice(None)`` for remaining levels (:issue:`19556`) - Bug in :meth:`DataFrame.loc` raising ``TypeError`` when non-integer slice was given to select values from :class:`MultiIndex` (:issue:`25165`, :issue:`24263`) +- Bug in :meth:`Series.at` returning :class:`Series` with one element instead of scalar when index is a :class:`MultiIndex` with one level (:issue:`38053`) - Bug in :meth:`DataFrame.loc` returning and assigning elements in wrong order when indexer is differently ordered than the :class:`MultiIndex` to filter (:issue:`31330`, :issue:`34603`) - Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.__getitem__` raising ``KeyError`` when columns were :class:`MultiIndex` with only one level (:issue:`29749`) - Bug in :meth:`Series.__getitem__` and :meth:`DataFrame.__getitem__` raising blank ``KeyError`` without missing keys for :class:`IntervalIndex` (:issue:`27365`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9b4b459d9a122..dacd802b21e63 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2530,6 +2530,10 @@ def _get_values_for_loc(self, series: "Series", loc, key): if is_scalar(loc): return new_values + if len(new_values) == 1 and not self.nlevels > 1: + # If more than one level left, we can not return a scalar + return new_values[0] + new_index = self[loc] new_index = maybe_droplevels(new_index, key) new_ser = series._constructor(new_values, index=new_index, name=series.name) diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index dd01f4e6a4f49..ce48fd1e5c905 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -268,35 +268,41 @@ def test_at_with_tuple_index_set(): assert series.at[1, 2] == 3 -def test_multiindex_at_get(): - # GH 26989 - # DataFrame.at and DataFrame.loc getter works with MultiIndex - df = DataFrame({"a": [1, 2]}, index=[[1, 2], [3, 4]]) - assert df.index.nlevels == 2 - assert df.at[(1, 3), "a"] == 1 - assert df.loc[(1, 3), "a"] == 1 - - # Series.at and Series.loc getter works with MultiIndex - series = df["a"] - assert series.index.nlevels == 2 - assert series.at[1, 3] == 1 - assert series.loc[1, 3] == 1 - - -def test_multiindex_at_set(): - # GH 26989 - # DataFrame.at and DataFrame.loc setter works with MultiIndex - df = DataFrame({"a": [1, 2]}, index=[[1, 2], [3, 4]]) - assert df.index.nlevels == 2 - df.at[(1, 3), "a"] = 3 - assert df.at[(1, 3), "a"] == 3 - df.loc[(1, 3), "a"] = 4 - assert df.loc[(1, 3), "a"] == 4 - - # Series.at and Series.loc setter works with MultiIndex - series = df["a"] - assert series.index.nlevels == 2 - series.at[1, 3] = 5 - assert 
series.at[1, 3] == 5 - series.loc[1, 3] = 6 - assert series.loc[1, 3] == 6 +class TestMultiIndexScalar: + def test_multiindex_at_get(self): + # GH 26989 + # DataFrame.at and DataFrame.loc getter works with MultiIndex + df = DataFrame({"a": [1, 2]}, index=[[1, 2], [3, 4]]) + assert df.index.nlevels == 2 + assert df.at[(1, 3), "a"] == 1 + assert df.loc[(1, 3), "a"] == 1 + + # Series.at and Series.loc getter works with MultiIndex + series = df["a"] + assert series.index.nlevels == 2 + assert series.at[1, 3] == 1 + assert series.loc[1, 3] == 1 + + def test_multiindex_at_set(self): + # GH 26989 + # DataFrame.at and DataFrame.loc setter works with MultiIndex + df = DataFrame({"a": [1, 2]}, index=[[1, 2], [3, 4]]) + assert df.index.nlevels == 2 + df.at[(1, 3), "a"] = 3 + assert df.at[(1, 3), "a"] == 3 + df.loc[(1, 3), "a"] = 4 + assert df.loc[(1, 3), "a"] == 4 + + # Series.at and Series.loc setter works with MultiIndex + series = df["a"] + assert series.index.nlevels == 2 + series.at[1, 3] = 5 + assert series.at[1, 3] == 5 + series.loc[1, 3] = 6 + assert series.loc[1, 3] == 6 + + def test_multiindex_at_get_one_level(self): + # GH#38053 + s2 = Series((0, 1), index=[[False, True]]) + result = s2.at[False] + assert result == 0 From 22007d3fd294033132be4fffb0fbdd10c8dd46de Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 29 Nov 2020 07:59:56 -0800 Subject: [PATCH 02/15] API: CategoricalIndex.append fallback to concat_compat (#38098) --- doc/source/whatsnew/v1.2.0.rst | 2 ++ pandas/core/indexes/base.py | 6 ---- pandas/core/indexes/category.py | 20 +++++++----- pandas/core/indexes/multi.py | 4 --- pandas/core/reshape/pivot.py | 31 +++++-------------- .../indexes/categorical/test_category.py | 8 ++--- pandas/tests/indexing/test_categorical.py | 9 ++++-- .../tests/reshape/concat/test_categorical.py | 14 ++++++--- 8 files changed, 41 insertions(+), 53 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f53cde7fac068..501e2878ab135 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -454,6 +454,7 @@ Other API changes - Passing an invalid ``fill_value`` to :meth:`Series.shift` with a ``CategoricalDtype`` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) - Passing an invalid value to :meth:`IntervalIndex.insert` or :meth:`CategoricalIndex.insert` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) - Attempting to reindex a Series with a :class:`CategoricalIndex` with an invalid ``fill_value`` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) +- :meth:`CategoricalIndex.append` with an index that contains non-category values will now cast instead of raising ``TypeError`` (:issue:`38098`) .. 
--------------------------------------------------------------------------- @@ -635,6 +636,7 @@ Indexing - Bug in :meth:`DataFrame.loc` returning and assigning elements in wrong order when indexer is differently ordered than the :class:`MultiIndex` to filter (:issue:`31330`, :issue:`34603`) - Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.__getitem__` raising ``KeyError`` when columns were :class:`MultiIndex` with only one level (:issue:`29749`) - Bug in :meth:`Series.__getitem__` and :meth:`DataFrame.__getitem__` raising blank ``KeyError`` without missing keys for :class:`IntervalIndex` (:issue:`27365`) +- Bug in setting a new label on a :class:`DataFrame` or :class:`Series` with a :class:`CategoricalIndex` incorrectly raising ``TypeError`` when the new label is not among the index's categories (:issue:`38098`) Missing ^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c49f3f9457161..c86652acbcd0f 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4180,12 +4180,6 @@ def _coerce_scalar_to_index(self, item): return Index([item], dtype=dtype, **self._get_attributes_dict()) - def _to_safe_for_reshape(self): - """ - Convert to object if we are a categorical. - """ - return self - def _validate_fill_value(self, value): """ Check if the value can be inserted into our array, and convert diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 7956b3a623333..abf70fd150345 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -399,10 +399,6 @@ def unique(self, level=None): # of result, not self. return type(self)._simple_new(result, name=self.name) - def _to_safe_for_reshape(self): - """ convert to object if we are a categorical """ - return self.astype("object") - def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ Create index with target's values (move/add/delete values as necessary) @@ -637,11 +633,19 @@ def map(self, mapper): mapped = self._values.map(mapper) return Index(mapped, name=self.name) - def _concat(self, to_concat: List["Index"], name: Label) -> "CategoricalIndex": + def _concat(self, to_concat: List["Index"], name: Label) -> Index: # if calling index is category, don't check dtype of others - codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) - cat = self._data._from_backing_data(codes) - return type(self)._simple_new(cat, name=name) + try: + codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) + except TypeError: + # not all to_concat elements are among our categories (or NA) + from pandas.core.dtypes.concat import concat_compat + + res = concat_compat(to_concat) + return Index(res, name=name) + else: + cat = self._data._from_backing_data(codes) + return type(self)._simple_new(cat, name=name) def _delegate_method(self, name: str, *args, **kwargs): """ method delegation to the ._values """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index dacd802b21e63..46846209f315b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1684,10 +1684,6 @@ def unique(self, level=None): level = self._get_level_number(level) return self._get_level_values(level=level, unique=True) - def _to_safe_for_reshape(self): - """ convert to object if we are a categorical """ - return self.set_levels([i._to_safe_for_reshape() for i in self.levels]) - def to_frame(self, index=True, name=None): """ Create a DataFrame with the levels of the MultiIndex as 
columns. diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 22887cede51ed..40496a5b8671b 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -268,19 +268,13 @@ def _add_margins( margin_dummy = DataFrame(row_margin, columns=[key]).T row_names = result.index.names - try: - # check the result column and leave floats - for dtype in set(result.dtypes): - cols = result.select_dtypes([dtype]).columns - margin_dummy[cols] = margin_dummy[cols].apply( - maybe_downcast_to_dtype, args=(dtype,) - ) - result = result.append(margin_dummy) - except TypeError: - - # we cannot reshape, so coerce the axis - result.index = result.index._to_safe_for_reshape() - result = result.append(margin_dummy) + # check the result column and leave floats + for dtype in set(result.dtypes): + cols = result.select_dtypes([dtype]).columns + margin_dummy[cols] = margin_dummy[cols].apply( + maybe_downcast_to_dtype, args=(dtype,) + ) + result = result.append(margin_dummy) result.index.names = row_names return result @@ -328,16 +322,7 @@ def _all_key(key): # we are going to mutate this, so need to copy! piece = piece.copy() - try: - piece[all_key] = margin[key] - except ValueError: - # we cannot reshape, so coerce the axis - piece.set_axis( - piece._get_axis(cat_axis)._to_safe_for_reshape(), - axis=cat_axis, - inplace=True, - ) - piece[all_key] = margin[key] + piece[all_key] = margin[key] table_pieces.append(piece) margin_keys.append(all_key) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 2e03c00638a5c..3bab57e1d265e 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -57,10 +57,10 @@ def test_append(self): expected = CategoricalIndex(list("aabbcaca"), categories=categories) tm.assert_index_equal(result, expected, exact=True) - # invalid objects - msg = "cannot append a non-category item to a CategoricalIndex" - with pytest.raises(TypeError, match=msg): - ci.append(Index(["a", "d"])) + # invalid objects -> cast to object via concat_compat + result = ci.append(Index(["a", "d"])) + expected = Index(["a", "a", "b", "b", "c", "a", "a", "d"]) + tm.assert_index_equal(result, expected, exact=True) # GH14298 - if base object is not categorical -> coerce to object result = Index(["c", "a"]).append(ci) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 6fff706e27cd2..1b9b6452b2e33 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -57,9 +57,12 @@ def test_loc_scalar(self): with pytest.raises(KeyError, match=r"^'d'$"): df.loc["d"] - msg = "cannot append a non-category item to a CategoricalIndex" - with pytest.raises(TypeError, match=msg): - df.loc["d"] = 10 + df2 = df.copy() + expected = df2.copy() + expected.index = expected.index.astype(object) + expected.loc["d"] = 10 + df2.loc["d"] = 10 + tm.assert_frame_equal(df2, expected) msg = "'fill_value=d' is not present in this Categorical's categories" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index 388575c5a3b86..6dae28003d3b6 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -1,5 +1,4 @@ import numpy as np -import pytest from pandas.core.dtypes.dtypes import CategoricalDtype @@ -137,13 +136,18 @@ def 
test_categorical_index_preserver(self): ).set_index("B") tm.assert_frame_equal(result, expected) - # wrong categories + # wrong categories -> uses concat_compat, which casts to object df3 = DataFrame( {"A": a, "B": Categorical(b, categories=list("abe"))} ).set_index("B") - msg = "categories must match existing categories when appending" - with pytest.raises(TypeError, match=msg): - pd.concat([df2, df3]) + result = pd.concat([df2, df3]) + expected = pd.concat( + [ + df2.set_axis(df2.index.astype(object), 0), + df3.set_axis(df3.index.astype(object), 0), + ] + ) + tm.assert_frame_equal(result, expected) def test_concat_categorical_tz(self): # GH-23816 From 8eca4b76bd3748c201a680a7b66f7dcd84b49315 Mon Sep 17 00:00:00 2001 From: mlondschien <61679398+mlondschien@users.noreply.github.com> Date: Sun, 29 Nov 2020 17:06:28 +0100 Subject: [PATCH 03/15] BUG: fix astype conversion string -> float (#37974) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/conftest.py | 15 ++++++++++++++- pandas/core/arrays/string_.py | 16 +++++++++++++++- pandas/tests/arrays/string_/test_string.py | 9 +++++++++ 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 501e2878ab135..7c07601352bce 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -589,6 +589,7 @@ Conversion ^^^^^^^^^^ - Bug in :meth:`DataFrame.to_dict` with ``orient='records'`` now returns python native datetime objects for datetimelike columns (:issue:`21256`) +- Bug in :meth:`Series.astype` conversion from ``string`` to ``float`` raised in presence of ``pd.NA`` values (:issue:`37626`) - Strings diff --git a/pandas/conftest.py b/pandas/conftest.py index a0ec6f96042fc..3d9d2ba04f31b 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -288,7 +288,6 @@ def unique_nulls_fixture(request): # Generate cartesian product of unique_nulls_fixture: unique_nulls_fixture2 = unique_nulls_fixture - # ---------------------------------------------------------------- # Classes # ---------------------------------------------------------------- @@ -1091,6 +1090,20 @@ def float_ea_dtype(request): return request.param +@pytest.fixture(params=tm.FLOAT_DTYPES + tm.FLOAT_EA_DTYPES) +def any_float_allowed_nullable_dtype(request): + """ + Parameterized fixture for float dtypes. 
+ + * float + * 'float32' + * 'float64' + * 'Float32' + * 'Float64' + """ + return request.param + + @pytest.fixture(params=tm.COMPLEX_DTYPES) def complex_dtype(request): """ diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e75305e55348c..cc2013deb5252 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -18,7 +18,8 @@ from pandas.core import ops from pandas.core.array_algos import masked_reductions -from pandas.core.arrays import IntegerArray, PandasArray +from pandas.core.arrays import FloatingArray, IntegerArray, PandasArray +from pandas.core.arrays.floating import FloatingDtype from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer @@ -294,6 +295,19 @@ def astype(self, dtype, copy=True): arr[mask] = 0 values = arr.astype(dtype.numpy_dtype) return IntegerArray(values, mask, copy=False) + elif isinstance(dtype, FloatingDtype): + arr = self.copy() + mask = self.isna() + arr[mask] = "0" + values = arr.astype(dtype.numpy_dtype) + return FloatingArray(values, mask, copy=False) + elif np.issubdtype(dtype, np.floating): + arr = self._ndarray.copy() + mask = self.isna() + arr[mask] = 0 + values = arr.astype(dtype) + values[mask] = np.nan + return values return super().astype(dtype, copy) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 9a1634380aaba..e35a632734779 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -366,6 +366,15 @@ def test_astype_int(dtype, request): tm.assert_extension_array_equal(result, expected) +def test_astype_float(any_float_allowed_nullable_dtype): + # Don't compare arrays (37974) + ser = pd.Series(["1.1", pd.NA, "3.3"], dtype="string") + + result = ser.astype(any_float_allowed_nullable_dtype) + expected = pd.Series([1.1, np.nan, 3.3], dtype=any_float_allowed_nullable_dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce(skipna, dtype): From 7b400b3428c138e9f324a33e92027d41857bbeb1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 29 Nov 2020 17:10:18 +0100 Subject: [PATCH 04/15] ENH: add use_nullable_dtypes option in read_parquet (#31242) --- doc/source/whatsnew/v1.2.0.rst | 4 +++ pandas/io/parquet.py | 61 ++++++++++++++++++++++++++++++--- pandas/tests/io/test_parquet.py | 37 ++++++++++++++++++++ 3 files changed, 98 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 7c07601352bce..c9347b88f2072 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -241,6 +241,10 @@ Other enhancements - Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`). 
- Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) - :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`) +- :func:`read_parquet` gained a ``use_nullable_dtypes=True`` option to use + nullable dtypes that use ``pd.NA`` as missing value indicator where possible + for the resulting DataFrame (default is False, and only applicable for + ``engine="pyarrow"``) (:issue:`31242`) - Added :meth:`.Rolling.sem` and :meth:`Expanding.sem` to compute the standard error of the mean (:issue:`26476`) - :meth:`.Rolling.var` and :meth:`.Rolling.std` use Kahan summation and Welford's Method to avoid numerical issues (:issue:`37051`) - :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welford's Method to avoid numerical issues (:issue:`37448`) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index a19b132a7891d..8b1184df92eaf 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -1,5 +1,6 @@ """ parquet compat """ +from distutils.version import LooseVersion import io import os from typing import Any, AnyStr, Dict, List, Optional, Tuple @@ -177,10 +178,39 @@ def write( handles.close() def read( - self, path, columns=None, storage_options: StorageOptions = None, **kwargs + self, + path, + columns=None, + use_nullable_dtypes=False, + storage_options: StorageOptions = None, + **kwargs, ): kwargs["use_pandas_metadata"] = True + to_pandas_kwargs = {} + if use_nullable_dtypes: + if LooseVersion(self.api.__version__) >= "0.16": + import pandas as pd + + mapping = { + self.api.int8(): pd.Int8Dtype(), + self.api.int16(): pd.Int16Dtype(), + self.api.int32(): pd.Int32Dtype(), + self.api.int64(): pd.Int64Dtype(), + self.api.uint8(): pd.UInt8Dtype(), + self.api.uint16(): pd.UInt16Dtype(), + self.api.uint32(): pd.UInt32Dtype(), + self.api.uint64(): pd.UInt64Dtype(), + self.api.bool_(): pd.BooleanDtype(), + self.api.string(): pd.StringDtype(), + } + to_pandas_kwargs["types_mapper"] = mapping.get + else: + raise ValueError( + "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 " + f"({self.api.__version__} is installed" + ) + path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle( path, kwargs.pop("filesystem", None), @@ -190,7 +220,7 @@ def read( try: return self.api.parquet.read_table( path_or_handle, columns=columns, **kwargs - ).to_pandas() + ).to_pandas(**to_pandas_kwargs) finally: if handles is not None: handles.close() @@ -258,6 +288,12 @@ def write( def read( self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): + use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) + if use_nullable_dtypes: + raise ValueError( + "The 'use_nullable_dtypes' argument is not supported for the " + "fastparquet engine" + ) path = stringify_path(path) parquet_kwargs = {} handles = None @@ -368,7 +404,13 @@ def to_parquet( return None -def read_parquet(path, engine: str = "auto", columns=None, **kwargs): +def read_parquet( + path, + engine: str = "auto", + columns=None, + use_nullable_dtypes: bool = False, + **kwargs, +): """ Load a parquet object from the file path, returning a DataFrame. @@ -397,6 +439,15 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): 'pyarrow' is unavailable. columns : list, default=None If not None, only these columns will be read from the file. 
+ use_nullable_dtypes : bool, default False + If True, use dtypes that use ``pd.NA`` as missing value indicator + for the resulting DataFrame (only applicable for ``engine="pyarrow"``). + As new dtypes are added that support ``pd.NA`` in the future, the + output with this option will change to use those dtypes. + Note: this is an experimental option, and behaviour (e.g. additional + support dtypes) may change without notice. + + .. versionadded:: 1.2.0 **kwargs Any additional kwargs are passed to the engine. @@ -405,4 +456,6 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): DataFrame """ impl = get_engine(engine) - return impl.read(path, columns=columns, **kwargs) + return impl.read( + path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs + ) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 3b83eed69c723..7e1d7fb17c8ed 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -828,6 +828,35 @@ def test_additional_extension_types(self, pa): ) check_round_trip(df, pa) + @td.skip_if_no("pyarrow", min_version="0.16") + def test_use_nullable_dtypes(self, pa): + import pyarrow.parquet as pq + + table = pyarrow.table( + { + "a": pyarrow.array([1, 2, 3, None], "int64"), + "b": pyarrow.array([1, 2, 3, None], "uint8"), + "c": pyarrow.array(["a", "b", "c", None]), + "d": pyarrow.array([True, False, True, None]), + } + ) + with tm.ensure_clean() as path: + # write manually with pyarrow to write integers + pq.write_table(table, path) + result1 = read_parquet(path) + result2 = read_parquet(path, use_nullable_dtypes=True) + + assert result1["a"].dtype == np.dtype("float64") + expected = pd.DataFrame( + { + "a": pd.array([1, 2, 3, None], dtype="Int64"), + "b": pd.array([1, 2, 3, None], dtype="UInt8"), + "c": pd.array(["a", "b", "c", None], dtype="string"), + "d": pd.array([True, False, True, None], dtype="boolean"), + } + ) + tm.assert_frame_equal(result2, expected) + @td.skip_if_no("pyarrow", min_version="0.14") def test_timestamp_nanoseconds(self, pa): # with version 2.0, pyarrow defaults to writing the nanoseconds, so @@ -1001,3 +1030,11 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list): expected = df.copy() expected.index.name = "index" check_round_trip(df, fp, expected=expected) + + def test_use_nullable_dtypes_not_supported(self, fp): + df = pd.DataFrame({"a": [1, 2]}) + + with tm.ensure_clean() as path: + df.to_parquet(path) + with pytest.raises(ValueError, match="not supported for the fastparquet"): + read_parquet(path, engine="fastparquet", use_nullable_dtypes=True) From e99e5ab32c4e831e7bbac0346189f4d6d86a6225 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Sun, 29 Nov 2020 18:21:52 +0100 Subject: [PATCH 05/15] BUG: Fix duplicates in intersection of multiindexes (#36927) --- doc/source/whatsnew/v1.1.5.rst | 1 + pandas/core/indexes/base.py | 9 +++++--- pandas/core/indexes/multi.py | 8 +++++-- pandas/core/ops/__init__.py | 5 +++- pandas/core/reshape/merge.py | 9 ++++++-- .../tests/indexes/base_class/test_setops.py | 2 +- pandas/tests/indexes/multi/test_setops.py | 23 +++++++++++++++++++ pandas/tests/indexes/test_setops.py | 10 ++++++++ pandas/tests/reshape/merge/test_merge.py | 2 +- 9 files changed, 59 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 46c4ad4f35fe4..edc2f7327abfc 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -23,6 +23,7 
@@ Fixed regressions - Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`) - Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) - Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`). +- Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c86652acbcd0f..3f89b0619e600 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2822,7 +2822,7 @@ def intersection(self, other, sort=False): self._assert_can_do_setop(other) other = ensure_index(other) - if self.equals(other): + if self.equals(other) and not self.has_duplicates: return self._get_reconciled_name_object(other) if not is_dtype_equal(self.dtype, other.dtype): @@ -2847,7 +2847,7 @@ def _intersection(self, other, sort=False): except TypeError: pass else: - return result + return algos.unique1d(result) try: indexer = Index(rvals).get_indexer(lvals) @@ -2858,11 +2858,14 @@ def _intersection(self, other, sort=False): indexer = algos.unique1d(Index(rvals).get_indexer_non_unique(lvals)[0]) indexer = indexer[indexer != -1] - result = other.take(indexer)._values + result = other.take(indexer).unique()._values if sort is None: result = algos.safe_sort(result) + # Intersection has to be unique + assert algos.unique(result).shape == result.shape + return result def difference(self, other, sort=None): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 46846209f315b..589da4a6c4ceb 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3601,6 +3601,8 @@ def intersection(self, other, sort=False): other, result_names = self._convert_can_do_setop(other) if self.equals(other): + if self.has_duplicates: + return self.unique().rename(result_names) return self.rename(result_names) if not is_object_dtype(other.dtype): @@ -3619,10 +3621,12 @@ def intersection(self, other, sort=False): uniq_tuples = None # flag whether _inner_indexer was successful if self.is_monotonic and other.is_monotonic: try: - uniq_tuples = self._inner_indexer(lvals, rvals)[0] - sort = False # uniq_tuples is already sorted + inner_tuples = self._inner_indexer(lvals, rvals)[0] + sort = False # inner_tuples is already sorted except TypeError: pass + else: + uniq_tuples = algos.unique(inner_tuples) if uniq_tuples is None: other_uniq = set(rvals) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 2b159c607b0a0..d8b5dba424cbf 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -311,7 +311,10 @@ def should_reindex_frame_op( # TODO: any other cases we should handle here? cols = left.columns.intersection(right.columns) - if len(cols) and not (cols.equals(left.columns) and cols.equals(right.columns)): + # Intersection is always unique so we have to check the unique columns + left_uniques = left.columns.unique() + right_uniques = right.columns.unique() + if len(cols) and not (cols.equals(left_uniques) and cols.equals(right_uniques)): # TODO: is there a shortcut available when len(cols) == 0? 
return True diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3b755c40721fb..9bb1add309407 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1271,7 +1271,9 @@ def _validate_specification(self): raise MergeError("Must pass left_on or left_index=True") else: # use the common columns - common_cols = self.left.columns.intersection(self.right.columns) + left_cols = self.left.columns + right_cols = self.right.columns + common_cols = left_cols.intersection(right_cols) if len(common_cols) == 0: raise MergeError( "No common columns to perform merge on. " @@ -1280,7 +1282,10 @@ def _validate_specification(self): f"left_index={self.left_index}, " f"right_index={self.right_index}" ) - if not common_cols.is_unique: + if ( + not left_cols.join(common_cols, how="inner").is_unique + or not right_cols.join(common_cols, how="inner").is_unique + ): raise MergeError(f"Data columns not unique: {repr(common_cols)}") self.left_on = self.right_on = common_cols elif self.on is not None: diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 6413b110dff2e..ddcb3c5b87ebc 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -141,7 +141,7 @@ def test_intersection_str_dates(self, sort): @pytest.mark.parametrize( "index2,expected_arr", - [(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), ["A", "B", "A"])], + [(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), ["A", "B"])], ) def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort): # non-monotonic non-unique diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 4ac9a27069a3f..2ac57f1befd57 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -378,3 +378,26 @@ def test_setops_disallow_true(method): with pytest.raises(ValueError, match="The 'sort' keyword only takes"): getattr(idx1, method)(idx2, sort=True) + + +@pytest.mark.parametrize( + ("tuples", "exp_tuples"), + [ + ([("val1", "test1")], [("val1", "test1")]), + ([("val1", "test1"), ("val1", "test1")], [("val1", "test1")]), + ( + [("val2", "test2"), ("val1", "test1")], + [("val2", "test2"), ("val1", "test1")], + ), + ], +) +def test_intersect_with_duplicates(tuples, exp_tuples): + # GH#36915 + left = MultiIndex.from_tuples(tuples, names=["first", "second"]) + right = MultiIndex.from_tuples( + [("val1", "test1"), ("val1", "test1"), ("val2", "test2")], + names=["first", "second"], + ) + result = left.intersection(right) + expected = MultiIndex.from_tuples(exp_tuples, names=["first", "second"]) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 0973cef7cfdc1..2675c4569a8e9 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -120,6 +120,16 @@ def test_dunder_inplace_setops_deprecated(index): index ^= index +@pytest.mark.parametrize("values", [[1, 2, 2, 3], [3, 3]]) +def test_intersection_duplicates(values): + # GH#31326 + a = pd.Index(values) + b = pd.Index([3, 3]) + result = a.intersection(b) + expected = pd.Index([3]) + tm.assert_index_equal(result, expected) + + class TestSetOps: # Set operation tests shared by all indexes in the `index` fixture @pytest.mark.parametrize("case", [0.5, "xxx"]) diff --git a/pandas/tests/reshape/merge/test_merge.py 
b/pandas/tests/reshape/merge/test_merge.py index f44909b61ff7a..40ba62a27aa68 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -753,7 +753,7 @@ def test_overlapping_columns_error_message(self): # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo', 'foo'\], dtype='object'\)" + msg = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) From 7070aae40b7bef2ee65ee0725ad9979b11a54704 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 30 Nov 2020 01:45:40 +0800 Subject: [PATCH 06/15] BUG: merge_ordered fails with list-like left_by or right_by (#38089) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/reshape/merge.py | 4 +- .../tests/reshape/merge/test_merge_ordered.py | 62 +++++++++++++++++++ 3 files changed, 64 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index c9347b88f2072..fe9c067b4d2ea 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -754,6 +754,7 @@ Reshaping - Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`) - Bug in :func:`concat` resulting in a ``ValueError`` when at least one of both inputs had a non-unique index (:issue:`36263`) - Bug in :meth:`DataFrame.merge` and :meth:`pandas.merge` returning inconsistent ordering in result for ``how=right`` and ``how=left`` (:issue:`35382`) +- Bug in :func:`merge_ordered` couldn't handle list-like ``left_by`` or ``right_by`` (:issue:`35269`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9bb1add309407..545117dd84f93 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -140,9 +140,7 @@ def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_piec # make sure join keys are in the merged # TODO, should merge_pieces do this? 
- for k in by: - if k in merged: - merged[k] = key + merged[by] = key pieces.append(merged) diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index 17f2f44f45fce..8389a6bb9be10 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -115,3 +115,65 @@ def test_doc_example(self): ) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "left, right, on, left_by, right_by, expected", + [ + ( + DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}), + DataFrame({"T": [2], "E": [1]}), + ["T"], + ["G", "H"], + None, + DataFrame( + { + "G": ["g"] * 3, + "H": ["h"] * 3, + "T": [1, 2, 3], + "E": [np.nan, 1.0, np.nan], + } + ), + ), + ( + DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}), + DataFrame({"T": [2], "E": [1]}), + "T", + ["G", "H"], + None, + DataFrame( + { + "G": ["g"] * 3, + "H": ["h"] * 3, + "T": [1, 2, 3], + "E": [np.nan, 1.0, np.nan], + } + ), + ), + ( + DataFrame({"T": [2], "E": [1]}), + DataFrame({"G": ["g", "g"], "H": ["h", "h"], "T": [1, 3]}), + ["T"], + None, + ["G", "H"], + DataFrame( + { + "T": [1, 2, 3], + "E": [np.nan, 1.0, np.nan], + "G": ["g"] * 3, + "H": ["h"] * 3, + } + ), + ), + ], + ) + def test_list_type_by(self, left, right, on, left_by, right_by, expected): + # GH 35269 + result = merge_ordered( + left=left, + right=right, + on=on, + left_by=left_by, + right_by=right_by, + ) + + tm.assert_frame_equal(result, expected) From eaa45cf7b1135dd5c9d1fe93717594566e55ecc9 Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Mon, 30 Nov 2020 02:06:23 +0800 Subject: [PATCH 07/15] DOC: Add behavior for Index argument in DataFrame.loc (#38109) --- pandas/core/indexing.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6aa031af64833..f6cf691ea911c 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -259,10 +259,11 @@ def loc(self) -> "_LocIndexer": e.g. ``[True, False, True]``. - An alignable boolean Series. The index of the key will be aligned before masking. + - An alignable Index. The Index of the returned selection will be the input. - A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above) - See more at :ref:`Selection by Label ` + See more at :ref:`Selection by Label `. 
Raises ------ @@ -332,6 +333,14 @@ def loc(self) -> "_LocIndexer": max_speed shield sidewinder 7 8 + Index (same behavior as ``df.reindex``) + + >>> df.loc[pd.Index(["cobra", "viper"], name="foo")] + max_speed shield + foo + cobra 1 2 + viper 4 5 + Conditional that returns a boolean Series >>> df.loc[df['shield'] > 6] From 224d2e88b349f3e8fc56104a4d28ddb5b08dfdd6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 29 Nov 2020 10:15:12 -0800 Subject: [PATCH 08/15] REF: de-duplicate ndarray[datetimelike] wrapping (#38129) --- pandas/core/arrays/interval.py | 12 +++++++----- pandas/core/construction.py | 18 ++++++++++++++++++ pandas/core/dtypes/concat.py | 22 +++++----------------- pandas/core/ops/array_ops.py | 34 +++++----------------------------- 4 files changed, 35 insertions(+), 51 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index efb66c9a47a97..757cea2c710b2 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -44,7 +44,11 @@ from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs from pandas.core.arrays.categorical import Categorical import pandas.core.common as com -from pandas.core.construction import array, extract_array +from pandas.core.construction import ( + array, + ensure_wrapped_if_datetimelike, + extract_array, +) from pandas.core.indexers import check_array_indexer from pandas.core.indexes.base import ensure_index from pandas.core.ops import invalid_comparison, unpack_zerodim_and_defer @@ -251,11 +255,9 @@ def _simple_new( raise ValueError(msg) # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray - from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array - - left = maybe_upcast_datetimelike_array(left) + left = ensure_wrapped_if_datetimelike(left) left = extract_array(left, extract_numpy=True) - right = maybe_upcast_datetimelike_array(right) + right = ensure_wrapped_if_datetimelike(right) right = extract_array(right, extract_numpy=True) lbase = getattr(left, "_ndarray", left).base diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f9ebe3f1e185e..96cf1be7520fb 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -402,6 +402,24 @@ def extract_array(obj: object, extract_numpy: bool = False) -> Union[Any, ArrayL return obj +def ensure_wrapped_if_datetimelike(arr): + """ + Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray. 
+ """ + if isinstance(arr, np.ndarray): + if arr.dtype.kind == "M": + from pandas.core.arrays import DatetimeArray + + return DatetimeArray._from_sequence(arr) + + elif arr.dtype.kind == "m": + from pandas.core.arrays import TimedeltaArray + + return TimedeltaArray._from_sequence(arr) + + return arr + + def sanitize_array( data, index: Optional[Index], diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 63e3440558c75..a9355e30cd3c2 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -18,7 +18,7 @@ from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseArray -from pandas.core.construction import array +from pandas.core.construction import array, ensure_wrapped_if_datetimelike def _get_dtype_kinds(arrays) -> Set[str]: @@ -360,12 +360,14 @@ def _concat_datetime(to_concat, axis=0): ------- a single array, preserving the combined dtypes """ - to_concat = [_wrap_datetimelike(x) for x in to_concat] + to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat] + single_dtype = len({x.dtype for x in to_concat}) == 1 # multiple types, need to coerce to object if not single_dtype: - # wrap_datetimelike ensures that astype(object) wraps in Timestamp/Timedelta + # ensure_wrapped_if_datetimelike ensures that astype(object) wraps + # in Timestamp/Timedelta return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis) if axis == 1: @@ -379,17 +381,3 @@ def _concat_datetime(to_concat, axis=0): assert result.shape[0] == 1 result = result[0] return result - - -def _wrap_datetimelike(arr): - """ - Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray. - - DTA/TDA handle .astype(object) correctly. - """ - from pandas.core.construction import array as pd_array, extract_array - - arr = extract_array(arr, extract_numpy=True) - if isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]: - arr = pd_array(arr) - return arr diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index c855687552e82..41d539564d91e 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -30,6 +30,7 @@ from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, notna +from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.ops import missing from pandas.core.ops.dispatch import should_extension_dispatch from pandas.core.ops.invalid import invalid_comparison @@ -175,8 +176,8 @@ def arithmetic_op(left: ArrayLike, right: Any, op): # NB: We assume that extract_array has already been called # on `left` and `right`. 
- lvalues = maybe_upcast_datetimelike_array(left) - rvalues = maybe_upcast_datetimelike_array(right) + lvalues = ensure_wrapped_if_datetimelike(left) + rvalues = ensure_wrapped_if_datetimelike(right) rvalues = _maybe_upcast_for_op(rvalues, lvalues.shape) if should_extension_dispatch(lvalues, rvalues) or isinstance(rvalues, Timedelta): @@ -206,7 +207,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: ndarray or ExtensionArray """ # NB: We assume extract_array has already been called on left and right - lvalues = maybe_upcast_datetimelike_array(left) + lvalues = ensure_wrapped_if_datetimelike(left) rvalues = right rvalues = lib.item_from_zerodim(rvalues) @@ -331,7 +332,7 @@ def fill_bool(x, left=None): right = construct_1d_object_array_from_listlike(right) # NB: We assume extract_array has already been called on left and right - lvalues = maybe_upcast_datetimelike_array(left) + lvalues = ensure_wrapped_if_datetimelike(left) rvalues = right if should_extension_dispatch(lvalues, rvalues): @@ -400,31 +401,6 @@ def get_array_op(op): raise NotImplementedError(op_name) -def maybe_upcast_datetimelike_array(obj: ArrayLike) -> ArrayLike: - """ - If we have an ndarray that is either datetime64 or timedelta64, wrap in EA. - - Parameters - ---------- - obj : ndarray or ExtensionArray - - Returns - ------- - ndarray or ExtensionArray - """ - if isinstance(obj, np.ndarray): - if obj.dtype.kind == "m": - from pandas.core.arrays import TimedeltaArray - - return TimedeltaArray._from_sequence(obj) - if obj.dtype.kind == "M": - from pandas.core.arrays import DatetimeArray - - return DatetimeArray._from_sequence(obj) - - return obj - - def _maybe_upcast_for_op(obj, shape: Shape): """ Cast non-pandas objects to pandas types to unify behavior of arithmetic From 1cb5f69c6f07808c0df9a96b5a5679a8308ffae9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 29 Nov 2020 10:15:59 -0800 Subject: [PATCH 09/15] BUG: Index.intersection casting to object instead of numeric (#38122) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/indexes/base.py | 6 +++-- pandas/core/indexes/multi.py | 10 +++---- pandas/tests/indexes/multi/test_setops.py | 20 +++++++++++++- pandas/tests/indexes/ranges/test_setops.py | 31 +++++++++++++++++++++- 5 files changed, 58 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index fe9c067b4d2ea..3b1d1b4f241b5 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -782,6 +782,7 @@ Other - Fixed metadata propagation in the :class:`Series.dt`, :class:`Series.str` accessors, :class:`DataFrame.duplicated`, :class:`DataFrame.stack`, :class:`DataFrame.unstack`, :class:`DataFrame.pivot`, :class:`DataFrame.append`, :class:`DataFrame.diff`, :class:`DataFrame.applymap` and :class:`DataFrame.update` methods (:issue:`28283`, :issue:`37381`) - Fixed metadata propagation when selecting columns with ``DataFrame.__getitem__`` (:issue:`28283`) - Bug in :meth:`Index.union` behaving differently depending on whether operand is an :class:`Index` or other list-like (:issue:`36384`) +- Bug in :meth:`Index.intersection` with non-matching numeric dtypes casting to ``object`` dtype instead of minimal common dtype (:issue:`38122`) - Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError`` rather than a bare ``Exception`` (:issue:`35744`) - Bug in ``dir`` where ``dir(obj)`` wouldn't show attributes defined on the instance for pandas objects (:issue:`37173`) 
- Bug in :meth:`RangeIndex.difference` returning :class:`Int64Index` in some cases where it should return :class:`RangeIndex` (:issue:`38028`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3f89b0619e600..09fe885e47754 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -33,6 +33,7 @@ from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.cast import ( + find_common_type, maybe_cast_to_integer_array, validate_numeric_casting, ) @@ -2826,8 +2827,9 @@ def intersection(self, other, sort=False): return self._get_reconciled_name_object(other) if not is_dtype_equal(self.dtype, other.dtype): - this = self.astype("O") - other = other.astype("O") + dtype = find_common_type([self.dtype, other.dtype]) + this = self.astype(dtype) + other = other.astype(dtype) return this.intersection(other, sort=sort) result = self._intersection(other, sort=sort) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 589da4a6c4ceb..4aedf03ca1800 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3717,16 +3717,14 @@ def _convert_can_do_setop(self, other): if not isinstance(other, Index): if len(other) == 0: - other = MultiIndex( - levels=[[]] * self.nlevels, - codes=[[]] * self.nlevels, - verify_integrity=False, - ) + return self[:0], self.names else: msg = "other must be a MultiIndex or a list of tuples" try: other = MultiIndex.from_tuples(other) - except TypeError as err: + except (ValueError, TypeError) as err: + # ValueError raised by tupels_to_object_array if we + # have non-object dtype raise TypeError(msg) from err else: result_names = get_unanimous_names(self, other) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 2ac57f1befd57..51538c556de15 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import MultiIndex, Series +from pandas import Index, MultiIndex, Series import pandas._testing as tm @@ -294,6 +294,24 @@ def test_intersection(idx, sort): # assert result.equals(tuples) +def test_intersection_non_object(idx, sort): + other = Index(range(3), name="foo") + + result = idx.intersection(other, sort=sort) + expected = MultiIndex(levels=idx.levels, codes=[[]] * idx.nlevels, names=None) + tm.assert_index_equal(result, expected, exact=True) + + # if we pass a length-0 ndarray (i.e. 
no name, we retain our idx.name) + result = idx.intersection(np.asarray(other)[:0], sort=sort) + expected = MultiIndex(levels=idx.levels, codes=[[]] * idx.nlevels, names=idx.names) + tm.assert_index_equal(result, expected, exact=True) + + msg = "other must be a MultiIndex or a list of tuples" + with pytest.raises(TypeError, match=msg): + # With non-zero length non-index, we try and fail to convert to tuples + idx.intersection(np.asarray(other), sort=sort) + + def test_intersect_equal_sort(): # GH-24959 idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py index 1fd41b017221b..5623b0904c0d5 100644 --- a/pandas/tests/indexes/ranges/test_setops.py +++ b/pandas/tests/indexes/ranges/test_setops.py @@ -3,11 +3,40 @@ import numpy as np import pytest -from pandas import Index, Int64Index, RangeIndex +from pandas import Index, Int64Index, RangeIndex, UInt64Index import pandas._testing as tm class TestRangeIndexSetOps: + @pytest.mark.parametrize("klass", [RangeIndex, Int64Index, UInt64Index]) + def test_intersection_mismatched_dtype(self, klass): + # check that we cast to float, not object + index = RangeIndex(start=0, stop=20, step=2, name="foo") + index = klass(index) + + flt = index.astype(np.float64) + + # bc index.equals(flt), we go through fastpath and get RangeIndex back + result = index.intersection(flt) + tm.assert_index_equal(result, index, exact=True) + + result = flt.intersection(index) + tm.assert_index_equal(result, flt, exact=True) + + # neither empty, not-equals + result = index.intersection(flt[1:]) + tm.assert_index_equal(result, flt[1:], exact=True) + + result = flt[1:].intersection(index) + tm.assert_index_equal(result, flt[1:], exact=True) + + # empty other + result = index.intersection(flt[:0]) + tm.assert_index_equal(result, flt[:0], exact=True) + + result = flt[:0].intersection(index) + tm.assert_index_equal(result, flt[:0], exact=True) + def test_intersection(self, sort): # intersect with Int64Index index = RangeIndex(start=0, stop=20, step=2) From 59710bcd85ab8982da1bb26af0db7575a2c3565f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 29 Nov 2020 10:23:46 -0800 Subject: [PATCH 10/15] CLN: remove unnecesary cast.maybe_convert_objects (#38144) --- pandas/core/dtypes/cast.py | 52 ------------------- pandas/core/groupby/generic.py | 6 +-- .../tests/dtypes/cast/test_convert_objects.py | 12 ----- 3 files changed, 3 insertions(+), 67 deletions(-) delete mode 100644 pandas/tests/dtypes/cast/test_convert_objects.py diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index fe40bc42887c4..27c5527536057 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -99,7 +99,6 @@ from pandas import Series from pandas.core.arrays import ExtensionArray from pandas.core.indexes.base import Index - from pandas.core.indexes.datetimes import DatetimeIndex _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max @@ -1121,57 +1120,6 @@ def astype_nansafe( return arr.view(dtype) -def maybe_convert_objects( - values: np.ndarray, convert_numeric: bool = True -) -> Union[np.ndarray, "DatetimeIndex"]: - """ - If we have an object dtype array, try to coerce dates and/or numbers. 
- - Parameters - ---------- - values : ndarray - convert_numeric : bool, default True - - Returns - ------- - ndarray or DatetimeIndex - """ - validate_bool_kwarg(convert_numeric, "convert_numeric") - - orig_values = values - - # convert dates - if is_object_dtype(values.dtype): - values = lib.maybe_convert_objects(values, convert_datetime=True) - - # convert timedeltas - if is_object_dtype(values.dtype): - values = lib.maybe_convert_objects(values, convert_timedelta=True) - - # convert to numeric - if is_object_dtype(values.dtype): - if convert_numeric: - try: - new_values = lib.maybe_convert_numeric( - values, set(), coerce_numeric=True - ) - except (ValueError, TypeError): - pass - else: - # if we are all nans then leave me alone - if not isna(new_values).all(): - values = new_values - - else: - # soft-conversion - values = lib.maybe_convert_objects(values) - - if values is orig_values: - values = values.copy() - - return values - - def soft_convert_objects( values: np.ndarray, datetime: bool = True, diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 244c47cd1f1ea..b9226732d5a69 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -37,7 +37,6 @@ find_common_type, maybe_cast_result, maybe_cast_result_dtype, - maybe_convert_objects, maybe_downcast_numeric, ) from pandas.core.dtypes.common import ( @@ -1867,8 +1866,9 @@ def _recast_datetimelike_result(result: DataFrame) -> DataFrame: # See GH#26285 for n in obj_cols: - converted = maybe_convert_objects( - result.iloc[:, n].values, convert_numeric=False + values = result.iloc[:, n].values + converted = lib.maybe_convert_objects( + values, convert_datetime=True, convert_timedelta=True ) result.iloc[:, n] = converted diff --git a/pandas/tests/dtypes/cast/test_convert_objects.py b/pandas/tests/dtypes/cast/test_convert_objects.py deleted file mode 100644 index a28d554acd312..0000000000000 --- a/pandas/tests/dtypes/cast/test_convert_objects.py +++ /dev/null @@ -1,12 +0,0 @@ -import numpy as np -import pytest - -from pandas.core.dtypes.cast import maybe_convert_objects - - -@pytest.mark.parametrize("data", [[1, 2], ["apply", "banana"]]) -def test_maybe_convert_objects_copy(data): - arr = np.array(data) - out = maybe_convert_objects(arr) - - assert arr is not out From 47d0da67be1ab53c92863365a72a4936be281442 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sun, 29 Nov 2020 19:11:29 +0000 Subject: [PATCH 11/15] API: membership checks on ExtensionArray containing NA values (#37867) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/arrays/base.py | 18 +++++++++++++++++ pandas/tests/extension/arrow/test_bool.py | 4 ++++ pandas/tests/extension/base/interface.py | 23 ++++++++++++++++++++++ pandas/tests/extension/decimal/array.py | 8 ++++++++ pandas/tests/extension/json/test_json.py | 7 +++++++ pandas/tests/extension/test_categorical.py | 22 +++++++++++++++++++++ 7 files changed, 83 insertions(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3b1d1b4f241b5..873437d917515 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -770,6 +770,7 @@ ExtensionArray - Fixed bug when applying a NumPy ufunc with multiple outputs to an :class:`.IntegerArray` returning None (:issue:`36913`) - Fixed an inconsistency in :class:`.PeriodArray`'s ``__init__`` signature to those of :class:`.DatetimeArray` and :class:`.TimedeltaArray` (:issue:`37289`) - Reductions for :class:`.BooleanArray`, :class:`.Categorical`, :class:`.DatetimeArray`, 
:class:`.FloatingArray`, :class:`.IntegerArray`, :class:`.PeriodArray`, :class:`.TimedeltaArray`, and :class:`.PandasArray` are now keyword-only methods (:issue:`37541`) +- Fixed a bug where a ``TypeError`` was wrongly raised if a membership check was made on an ``ExtensionArray`` containing nan-like values (:issue:`37867`) Other ^^^^^ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 448025e05422d..76b7877b0ac70 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -37,6 +37,7 @@ is_array_like, is_dtype_equal, is_list_like, + is_scalar, pandas_dtype, ) from pandas.core.dtypes.dtypes import ExtensionDtype @@ -354,6 +355,23 @@ def __iter__(self): for i in range(len(self)): yield self[i] + def __contains__(self, item) -> bool: + """ + Return for `item in self`. + """ + # GH37867 + # comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA] + # would raise a TypeError. The implementation below works around that. + if is_scalar(item) and isna(item): + if not self._can_hold_na: + return False + elif item is self.dtype.na_value or isinstance(item, self.dtype.type): + return self.isna().any() + else: + return False + else: + return (item == self).any() + def __eq__(self, other: Any) -> ArrayLike: """ Return for `self == other` (element-wise equality). diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 12426a0c92c55..922b3b94c16c1 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -50,6 +50,10 @@ def test_view(self, data): # __setitem__ does not work, so we only have a smoke-test data.view() + @pytest.mark.xfail(raises=AssertionError, reason="Not implemented yet") + def test_contains(self, data, data_missing, nulls_fixture): + super().test_contains(data, data_missing, nulls_fixture) + class TestConstructors(BaseArrowTests, base.BaseConstructorsTests): def test_from_dtype(self, data): diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 9ae4b01508d79..d7997310dde3d 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -29,6 +29,29 @@ def test_can_hold_na_valid(self, data): # GH-20761 assert data._can_hold_na is True + def test_contains(self, data, data_missing, nulls_fixture): + # GH-37867 + # Tests for membership checks. Membership checks for nan-likes is tricky and + # the settled on rule is: `nan_like in arr` is True if nan_like is + # arr.dtype.na_value and arr.isna().any() is True. Else the check returns False. 
+
+        na_value = data.dtype.na_value
+        # ensure data without missing values
+        data = data[~data.isna()]
+
+        # first elements are non-missing
+        assert data[0] in data
+        assert data_missing[0] in data_missing
+
+        # check the presence of na_value
+        assert na_value in data_missing
+        assert na_value not in data
+
+        if nulls_fixture is not na_value:
+            # the data can never contain nan-likes other than na_value
+            assert nulls_fixture not in data
+            assert nulls_fixture not in data_missing
+
     def test_memory_usage(self, data):
         s = pd.Series(data)
         result = s.memory_usage(index=False)
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
index 9ede9c7fbd0fd..a713550dafa5c 100644
--- a/pandas/tests/extension/decimal/array.py
+++ b/pandas/tests/extension/decimal/array.py
@@ -155,6 +155,14 @@ def __setitem__(self, key, value):
     def __len__(self) -> int:
         return len(self._data)

+    def __contains__(self, item) -> bool:
+        if not isinstance(item, decimal.Decimal):
+            return False
+        elif item.is_nan():
+            return self.isna().any()
+        else:
+            return super().__contains__(item)
+
     @property
     def nbytes(self) -> int:
         n = len(self)
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
index 74ca341e27bf8..3a5e49796c53b 100644
--- a/pandas/tests/extension/json/test_json.py
+++ b/pandas/tests/extension/json/test_json.py
@@ -143,6 +143,13 @@ def test_custom_asserts(self):
         with pytest.raises(AssertionError, match=msg):
             self.assert_frame_equal(a.to_frame(), b.to_frame())

+    @pytest.mark.xfail(
+        reason="comparison method not implemented for JSONArray (GH-37867)"
+    )
+    def test_contains(self, data):
+        # GH-37867
+        super().test_contains(data)
+

 class TestConstructors(BaseJSON, base.BaseConstructorsTests):
     @pytest.mark.skip(reason="not implemented constructor from dtype")
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
index 95f338cbc3240..d03a9ab6b2588 100644
--- a/pandas/tests/extension/test_categorical.py
+++ b/pandas/tests/extension/test_categorical.py
@@ -87,6 +87,28 @@ def test_memory_usage(self, data):
         # Is this deliberate?
         super().test_memory_usage(data)

+    def test_contains(self, data, data_missing, nulls_fixture):
+        # GH-37867
+        # na value handling in Categorical.__contains__ is deprecated.
+        # See base.BaseInterfaceTests.test_contains for more details.
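+        # Unlike the base class test, the final assertion below checks that a
+        # Categorical can also report membership for nan-likes other than its
+        # na_value.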
+
+        na_value = data.dtype.na_value
+        # ensure data without missing values
+        data = data[~data.isna()]
+
+        # first elements are non-missing
+        assert data[0] in data
+        assert data_missing[0] in data_missing
+
+        # check the presence of na_value
+        assert na_value in data_missing
+        assert na_value not in data
+
+        # Categoricals can contain nan-likes other than na_value
+        if nulls_fixture is not na_value:
+            assert nulls_fixture not in data
+            assert nulls_fixture in data_missing  # this line differs from super method
+

 class TestConstructors(base.BaseConstructorsTests):
     pass

From 4a35f2d6ecd1bea4e064384c48346aaf245188ff Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 29 Nov 2020 20:12:54 +0100
Subject: [PATCH 12/15] ENH: include conversion to nullable float in convert_dtypes() (#38117)

---
 pandas/core/dtypes/cast.py                    | 32 ++++++++++++++-
 pandas/core/generic.py                        | 41 ++++++++++++++-----
 pandas/core/series.py                         |  9 +++-
 .../series/methods/test_convert_dtypes.py     | 35 ++++++++++++----
 4 files changed, 96 insertions(+), 21 deletions(-)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 27c5527536057..3c4c811c94534 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1196,6 +1196,7 @@ def convert_dtypes(
     convert_string: bool = True,
     convert_integer: bool = True,
     convert_boolean: bool = True,
+    convert_floating: bool = True,
 ) -> Dtype:
     """
     Convert objects to best possible type, and optionally,
@@ -1210,6 +1211,10 @@ def convert_dtypes(
         Whether, if possible, conversion can be done to integer extension types.
     convert_boolean : bool, defaults True
        Whether object dtypes should be converted to ``BooleanDtypes()``.
+    convert_floating : bool, defaults True
+        Whether, if possible, conversion can be done to floating extension types.
+        If `convert_integer` is also True, preference will be given to integer
+        dtypes if the floats can be faithfully cast to integers.

     Returns
     -------
     dtype
         new dtype
     """
     is_extension = is_extension_array_dtype(input_array.dtype)
-    if (convert_string or convert_integer or convert_boolean) and not is_extension:
+    if (
+        convert_string or convert_integer or convert_boolean or convert_floating
+    ) and not is_extension:
         try:
             inferred_dtype = lib.infer_dtype(input_array)
         except ValueError:
@@ -1245,6 +1252,29 @@ def convert_dtypes(
             if is_integer_dtype(inferred_dtype):
                 inferred_dtype = input_array.dtype

+        if convert_floating:
+            if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
+                input_array.dtype
+            ):
+                from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE
+
+                inferred_float_dtype = FLOAT_STR_TO_DTYPE.get(
+                    input_array.dtype.name, "Float64"
+                )
+                # if we could also convert to integer, check if all floats
+                # are actually integers
+                if convert_integer:
+                    arr = input_array[notna(input_array)]
+                    if (arr.astype(int) == arr).all():
+                        inferred_dtype = "Int64"
+                    else:
+                        inferred_dtype = inferred_float_dtype
+                else:
+                    inferred_dtype = inferred_float_dtype
+        else:
+            if is_float_dtype(inferred_dtype):
+                inferred_dtype = input_array.dtype
+
         if convert_boolean:
             if is_bool_dtype(input_array.dtype):
                 inferred_dtype = "boolean"
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index c7448cf8f8e40..c9f862d136477 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6088,6 +6088,7 @@ def convert_dtypes(
         convert_string: bool_t = True,
         convert_integer: bool_t = True,
         convert_boolean: bool_t = True,
+        convert_floating: bool_t = True,
     ) -> FrameOrSeries:
         """
         Convert columns to best possible dtypes using dtypes supporting ``pd.NA``.
@@ -6104,6 +6105,12 @@ def convert_dtypes(
             Whether, if possible, conversion can be done to integer extension types.
         convert_boolean : bool, defaults True
             Whether object dtypes should be converted to ``BooleanDtypes()``.
+        convert_floating : bool, defaults True
+            Whether, if possible, conversion can be done to floating extension types.
+            If `convert_integer` is also True, preference will be given to integer
+            dtypes if the floats can be faithfully cast to integers.
+
+            .. versionadded:: 1.2.0

         Returns
         -------
@@ -6121,19 +6128,25 @@ def convert_dtypes(
         -----
         By default, ``convert_dtypes`` will attempt to convert a Series (or each
         Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
-        ``convert_string``, ``convert_integer``, and ``convert_boolean``, it is
-        possible to turn off individual conversions to ``StringDtype``, the integer
-        extension types or ``BooleanDtype``, respectively.
+        ``convert_string``, ``convert_integer``, ``convert_boolean`` and
+        ``convert_floating``, it is possible to turn off individual conversions
+        to ``StringDtype``, the integer extension types, ``BooleanDtype``
+        or floating extension types, respectively.

         For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
         rules as during normal Series/DataFrame construction.  Then, if possible,
-        convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension
-        type, otherwise leave as ``object``.
+        convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer
+        or floating extension type, otherwise leave as ``object``.

         If the dtype is integer, convert to an appropriate integer extension type.

         If the dtype is numeric, and consists of all integers, convert to an
-        appropriate integer extension type.
+        appropriate integer extension type. Otherwise, convert to an
+        appropriate floating extension type.
+
+        .. versionchanged:: 1.2
+            Starting with pandas 1.2, this method also converts float columns
+            to the nullable floating extension type.

         In the future, as new dtypes are added that support ``pd.NA``, the results
         of this method will change to support those new dtypes.
@@ -6173,7 +6186,7 @@ def convert_dtypes(
         >>> dfn = df.convert_dtypes()
         >>> dfn
            a  b      c  d      e      f
-        0  1  x   True  h     10    NaN
+        0  1  x   True  h     10   <NA>
         1  2  y  False  i   <NA>  100.5
         2  3  z   <NA>  <NA>   20  200.0
@@ -6183,7 +6196,7 @@ def convert_dtypes(
         c    boolean
         d     string
         e      Int64
-        f    float64
+        f    Float64
         dtype: object

         Start with a Series of strings and missing data represented by ``np.nan``.
@@ -6205,12 +6218,20 @@ def convert_dtypes(
         """
         if self.ndim == 1:
             return self._convert_dtypes(
-                infer_objects, convert_string, convert_integer, convert_boolean
+                infer_objects,
+                convert_string,
+                convert_integer,
+                convert_boolean,
+                convert_floating,
             )
         else:
             results = [
                 col._convert_dtypes(
-                    infer_objects, convert_string, convert_integer, convert_boolean
+                    infer_objects,
+                    convert_string,
+                    convert_integer,
+                    convert_boolean,
+                    convert_floating,
                 )
                 for col_name, col in self.items()
             ]
diff --git a/pandas/core/series.py b/pandas/core/series.py
index d493ac0a8c051..1f4221206e5bc 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -4706,6 +4706,7 @@ def _convert_dtypes(
         convert_string: bool = True,
         convert_integer: bool = True,
         convert_boolean: bool = True,
+        convert_floating: bool = True,
     ) -> "Series":
         input_series = self
         if infer_objects:
@@ -4713,9 +4714,13 @@ def _convert_dtypes(
             if is_object_dtype(input_series):
                 input_series = input_series.copy()

-        if convert_string or convert_integer or convert_boolean:
+        if convert_string or convert_integer or convert_boolean or convert_floating:
             inferred_dtype = convert_dtypes(
-                input_series._values, convert_string, convert_integer, convert_boolean
+                input_series._values,
+                convert_string,
+                convert_integer,
+                convert_boolean,
+                convert_floating,
             )
             try:
                 result = input_series.astype(inferred_dtype)
diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py
index d44667b258414..920182a99e9ef 100644
--- a/pandas/tests/series/methods/test_convert_dtypes.py
+++ b/pandas/tests/series/methods/test_convert_dtypes.py
@@ -58,9 +58,17 @@
         [10, np.nan, 20],
         np.dtype("float"),
         "Int64",
-        {("convert_integer", False): np.dtype("float")},
+        {
+            ("convert_integer", False, "convert_floating", True): "Float64",
+            ("convert_integer", False, "convert_floating", False): np.dtype("float"),
+        },
+    ),
+    (
+        [np.nan, 100.5, 200],
+        np.dtype("float"),
+        "Float64",
+        {("convert_floating", False): np.dtype("float")},
     ),
-    ([np.nan, 100.5, 200], np.dtype("float"), np.dtype("float"), {}),
     (
         [3, 4, 5],
         "Int8",
@@ -85,20 +93,30 @@
         "Int8",
         {("convert_integer", False): np.dtype("i1")},
     ),
+    (
+        [1.2, 1.3],
+        np.dtype("float32"),
+        "Float32",
+        {("convert_floating", False): np.dtype("float32")},
+    ),
     (
         [1, 2.0],
         object,
         "Int64",
         {
-            ("convert_integer", False): np.dtype("float"),
+            ("convert_integer", False): "Float64",
+            ("convert_integer", False, "convert_floating", False): np.dtype("float"),
             ("infer_objects", False): np.dtype("object"),
         },
     ),
     (
         [1, 2.5],
         object,
-        np.dtype("float"),
-        {("infer_objects", False): np.dtype("object")},
+        "Float64",
+        {
+            ("convert_floating", False): np.dtype("float"),
+            ("infer_objects", False): np.dtype("object"),
+        },
     ),
     (["a", "b"], pd.CategoricalDtype(), pd.CategoricalDtype(), {}),
     (
@@ -134,7 +152,7 @@ class TestSeriesConvertDtypes:
         "data, maindtype, expected_default, expected_other",
         test_cases,
     )
-    @pytest.mark.parametrize("params", product(*[(True, False)] * 4))
+    @pytest.mark.parametrize("params", product(*[(True, False)] * 5))
     def test_convert_dtypes(
         self, data, maindtype, params, expected_default, expected_other
     ):
@@ -150,12 +168,13 @@ def test_convert_dtypes(
             "convert_string",
             "convert_integer",
             "convert_boolean",
+            "convert_floating",
         ]
         params_dict = dict(zip(param_names, params))

         expected_dtype = expected_default
-        for (key, val), dtype in expected_other.items():
-            if params_dict[key] is val:
+        for spec, dtype in expected_other.items():
+            if all(params_dict[key] is val for key, val in zip(spec[::2], spec[1::2])):
                 expected_dtype = dtype

         expected = pd.Series(data, dtype=expected_dtype)

From f65f0d3edb275faf37435ba1fa2780240b105b48 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Sun, 29 Nov 2020 11:18:39 -0800
Subject: [PATCH 13/15] DEPR: ExtensionOpsMixin -> OpsMixin (#38142)

---
 doc/source/whatsnew/v1.2.0.rst                |  1 +
 pandas/core/arrays/base.py                    | 16 +++++++
 pandas/tests/arrays/test_deprecations.py      | 19 ++++++++
 pandas/tests/extension/decimal/array.py       | 44 ++++++++++++++++---
 .../tests/extension/decimal/test_decimal.py   |  7 +--
 5 files changed, 76 insertions(+), 11 deletions(-)
 create mode 100644 pandas/tests/arrays/test_deprecations.py

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 873437d917515..bb06bcc9b5aa8 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -492,6 +492,7 @@ Deprecations
 - Deprecated :meth:`Index.asi8` for :class:`Index` subclasses other than :class:`.DatetimeIndex`, :class:`.TimedeltaIndex`, and :class:`PeriodIndex` (:issue:`37877`)
 - The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`)
 - The ``null_counts`` parameter of :meth:`DataFrame.info` is deprecated and replaced by ``show_counts``. It will be removed in a future version (:issue:`37999`)
+- :class:`ExtensionOpsMixin` and :class:`ExtensionScalarOpsMixin` are deprecated and will be removed in a future version. Use ``pd.core.arraylike.OpsMixin`` instead (:issue:`37080`)

 .. ---------------------------------------------------------------------------

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 76b7877b0ac70..e3469bba23ccd 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -21,6 +21,7 @@
     Union,
     cast,
 )
+import warnings

 import numpy as np

@@ -1237,6 +1238,21 @@ class ExtensionOpsMixin:
     with NumPy arrays.
     """

+    def __init_subclass__(cls, **kwargs):
+        # We use __init_subclass__ to handle deprecations
+        super().__init_subclass__(**kwargs)
+
+        if cls.__name__ != "ExtensionScalarOpsMixin":
+            # We only want to warn for user-defined subclasses,
+            # and cannot reference ExtensionScalarOpsMixin directly at this point.
+            warnings.warn(
+                "ExtensionOpsMixin and ExtensionScalarOpsMixin are deprecated "
+                "and will be removed in a future version. Use "
+                "pd.core.arraylike.OpsMixin instead.",
+                FutureWarning,
+                stacklevel=2,
+            )
+
     @classmethod
     def _create_arithmetic_method(cls, op):
         raise AbstractMethodError(cls)
diff --git a/pandas/tests/arrays/test_deprecations.py b/pandas/tests/arrays/test_deprecations.py
new file mode 100644
index 0000000000000..7e80072e8794f
--- /dev/null
+++ b/pandas/tests/arrays/test_deprecations.py
@@ -0,0 +1,19 @@
+import pandas._testing as tm
+from pandas.core.arrays import (
+    ExtensionArray,
+    ExtensionOpsMixin,
+    ExtensionScalarOpsMixin,
+)
+
+
+def test_extension_ops_mixin_deprecated():
+    # GH#37080 deprecated in favor of OpsMixin
+    with tm.assert_produces_warning(FutureWarning):
+
+        class MySubclass(ExtensionOpsMixin, ExtensionArray):
+            pass
+
+    with tm.assert_produces_warning(FutureWarning):
+
+        class MyOtherSubclass(ExtensionScalarOpsMixin, ExtensionArray):
+            pass
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
index a713550dafa5c..d7bdca4b218b5 100644
--- a/pandas/tests/extension/decimal/array.py
+++ b/pandas/tests/extension/decimal/array.py
@@ -7,12 +7,13 @@
 import numpy as np

 from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.cast import maybe_cast_to_extension_array
 from pandas.core.dtypes.common import is_dtype_equal, is_list_like, pandas_dtype

 import pandas as pd
 from pandas.api.extensions import no_default, register_extension_dtype
 from pandas.core.arraylike import OpsMixin
-from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin
+from pandas.core.arrays import ExtensionArray
 from pandas.core.indexers import check_array_indexer


@@ -45,7 +46,7 @@ def _is_numeric(self) -> bool:
         return True


-class DecimalArray(OpsMixin, ExtensionScalarOpsMixin, ExtensionArray):
+class DecimalArray(OpsMixin, ExtensionArray):
     __array_priority__ = 1000

     def __init__(self, values, dtype=None, copy=False, context=None):
@@ -225,6 +226,42 @@ def convert_values(param):

         return np.asarray(res, dtype=bool)

+    _do_coerce = True  # overridden in DecimalArrayWithoutCoercion
+
+    def _arith_method(self, other, op):
+        def convert_values(param):
+            if isinstance(param, ExtensionArray) or is_list_like(param):
+                ovalues = param
+            else:  # Assume it's an object
+                ovalues = [param] * len(self)
+            return ovalues
+
+        lvalues = self
+        rvalues = convert_values(other)
+
+        # If the operator is not defined for the underlying objects,
+        # a TypeError should be raised
+        res = [op(a, b) for (a, b) in zip(lvalues, rvalues)]
+
+        def _maybe_convert(arr):
+            if self._do_coerce:
+                # https://github.com/pandas-dev/pandas/issues/22850
+                # We catch all regular exceptions here, and fall back
+                # to an ndarray.
+                res = maybe_cast_to_extension_array(type(self), arr)
+                if not isinstance(res, type(self)):
+                    # exception raised in _from_sequence; ensure we have ndarray
+                    res = np.asarray(arr)
+            else:
+                res = np.asarray(arr)
+            return res
+
+        if op.__name__ in {"divmod", "rdivmod"}:
+            a, b = zip(*res)
+            return _maybe_convert(a), _maybe_convert(b)
+
+        return _maybe_convert(res)
+

 def to_decimal(values, context=None):
     return DecimalArray([decimal.Decimal(x) for x in values], context=context)


 def make_data():
     return [decimal.Decimal(random.random()) for _ in range(100)]
-
-
-DecimalArray._add_arithmetic_ops()
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index 233b658d29782..c3e84f75ebe68 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -335,12 +335,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):


 class DecimalArrayWithoutCoercion(DecimalArrayWithoutFromSequence):
-    @classmethod
-    def _create_arithmetic_method(cls, op):
-        return cls._create_method(op, coerce_to_dtype=False)
-
-
-DecimalArrayWithoutCoercion._add_arithmetic_ops()
+    _do_coerce = False


 def test_combine_from_sequence_raises():

From d98b37dd5604844e7ce9eeb4d7e1db7efd86c07e Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Sun, 29 Nov 2020 11:25:08 -0800
Subject: [PATCH 14/15] REF: use np.where instead of maybe_upcast_putmask in nanops (#38130)

---
 pandas/core/nanops.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 80c4cd5b44a92..88662a4fabed8 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -12,7 +12,6 @@
 from pandas._typing import ArrayLike, Dtype, DtypeObj, F, Scalar
 from pandas.compat._optional import import_optional_dependency

-from pandas.core.dtypes.cast import maybe_upcast_putmask
 from pandas.core.dtypes.common import (
     get_dtype,
     is_any_int_dtype,
@@ -284,7 +283,7 @@ def _get_values(
     """
     # _get_values is only called from within nanops, and in all cases
    #  with scalar fill_value.  This guarantee is important for the
-    #  maybe_upcast_putmask call below
+    #  np.where call below
     assert is_scalar(fill_value)

     values = extract_array(values, extract_numpy=True)
@@ -292,10 +291,12 @@ def _get_values(

     dtype = values.dtype

+    datetimelike = False
     if needs_i8_conversion(values.dtype):
         # changing timedelta64/datetime64 to int64 needs to happen after
         # finding `mask` above
         values = np.asarray(values.view("i8"))
+        datetimelike = True

     dtype_ok = _na_ok_dtype(dtype)

@@ -306,13 +307,13 @@ def _get_values(
     )

     if skipna and (mask is not None) and (fill_value is not None):
-        values = values.copy()
-        if dtype_ok and mask.any():
-            np.putmask(values, mask, fill_value)
-
-        # promote if needed
-        else:
-            values, _ = maybe_upcast_putmask(values, mask, fill_value)
+        if mask.any():
+            if dtype_ok or datetimelike:
+                values = values.copy()
+                np.putmask(values, mask, fill_value)
+            else:
+                # np.where will promote if needed
+                values = np.where(~mask, values, fill_value)

     # return a platform independent precision dtype
     dtype_max = dtype

From f04ec866120f134321fd63f0f0d170a8b8c98591 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Sun, 29 Nov 2020 11:37:00 -0800
Subject: [PATCH 15/15] CLN: remove unreachable in maybe_cast_result (#38152)

---
 pandas/core/dtypes/cast.py | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 3c4c811c94534..08e7671e0b674 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -296,7 +296,9 @@ def trans(x):
     return result


-def maybe_cast_result(result, obj: "Series", numeric_only: bool = False, how: str = ""):
+def maybe_cast_result(
+    result: ArrayLike, obj: "Series", numeric_only: bool = False, how: str = ""
+) -> ArrayLike:
     """
     Try casting result to a different type if appropriate

@@ -319,19 +321,20 @@ def maybe_cast_result(result, obj: "Series", numeric_only: bool = False, how: st
     dtype = obj.dtype
     dtype = maybe_cast_result_dtype(dtype, how)

-    if not is_scalar(result):
-        if (
-            is_extension_array_dtype(dtype)
-            and not is_categorical_dtype(dtype)
-            and dtype.kind != "M"
-        ):
-            # We have to special case categorical so as not to upcast
-            # things like counts back to categorical
-            cls = dtype.construct_array_type()
-            result = maybe_cast_to_extension_array(cls, result, dtype=dtype)
+    assert not is_scalar(result)
+
+    if (
+        is_extension_array_dtype(dtype)
+        and not is_categorical_dtype(dtype)
+        and dtype.kind != "M"
+    ):
+        # We have to special case categorical so as not to upcast
+        # things like counts back to categorical
+        cls = dtype.construct_array_type()
+        result = maybe_cast_to_extension_array(cls, result, dtype=dtype)

-        elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
-            result = maybe_downcast_to_dtype(result, dtype)
+    elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
+        result = maybe_downcast_to_dtype(result, dtype)

     return result
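
For reference, a short doctest-style illustration of the combined behavior that
PATCH 11/15 (#37867) and PATCH 12/15 (#38117) above aim for. This is a sketch, not
part of the patch series: the outputs assume a pandas build with these patches
applied, and ``ser`` and ``arr`` are hypothetical names.

>>> import numpy as np
>>> import pandas as pd
>>> ser = pd.Series([10.0, np.nan, 20.0])
>>> ser.convert_dtypes().dtype  # non-missing floats cast faithfully to int
Int64Dtype()
>>> ser.convert_dtypes(convert_integer=False).dtype
Float64Dtype()
>>> pd.Series([np.nan, 100.5, 200]).convert_dtypes().dtype
Float64Dtype()
>>> arr = pd.array([1, 2, pd.NA], dtype="Int64")
>>> pd.NA in arr  # pd.NA is Int64's na_value and arr has a missing entry
True
>>> np.nan in arr  # np.nan is not Int64's na_value
False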