From d4c8d82b52045f49a0bb1d762968918a06886ae9 Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Fri, 22 Dec 2023 11:18:49 -0800 Subject: [PATCH 001/396] RLS: 2.2.0rc0 From b59e594f6af21f04e33697bf6bd4cbcf02b8cff2 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 24 Dec 2023 17:46:25 +0100 Subject: [PATCH 002/396] Backport PR #56595 on branch 2.2.x (TST/CLN: Inline seldom used fixture) (#56612) Backport PR #56595: TST/CLN: Inline seldom used fixture Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/arrays/categorical/conftest.py | 9 --------- pandas/tests/arrays/categorical/test_api.py | 3 ++- pandas/tests/arrays/categorical/test_indexing.py | 6 ++++-- pandas/tests/arrays/categorical/test_operators.py | 3 ++- pandas/tests/arrays/categorical/test_repr.py | 3 ++- pandas/tests/indexes/datetimes/test_ops.py | 3 +-- pandas/tests/tseries/offsets/conftest.py | 13 ------------- pandas/tests/tseries/offsets/test_common.py | 3 ++- 8 files changed, 13 insertions(+), 30 deletions(-) delete mode 100644 pandas/tests/arrays/categorical/conftest.py delete mode 100644 pandas/tests/tseries/offsets/conftest.py diff --git a/pandas/tests/arrays/categorical/conftest.py b/pandas/tests/arrays/categorical/conftest.py deleted file mode 100644 index 37249210f28f4..0000000000000 --- a/pandas/tests/arrays/categorical/conftest.py +++ /dev/null @@ -1,9 +0,0 @@ -import pytest - -from pandas import Categorical - - -@pytest.fixture -def factor(): - """Fixture returning a Categorical object""" - return Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index b4215b4a6fe21..a939ee5f6f53f 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -385,7 +385,8 @@ def test_remove_unused_categories(self): class TestCategoricalAPIWithFactor: - def test_describe(self, factor): + def test_describe(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) # string type desc = factor.describe() assert factor.ordered diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 3377c411a7084..5e1c5c64fa660 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -21,7 +21,8 @@ class TestCategoricalIndexingWithFactor: - def test_getitem(self, factor): + def test_getitem(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) assert factor[0] == "a" assert factor[-1] == "c" @@ -31,7 +32,8 @@ def test_getitem(self, factor): subf = factor[np.asarray(factor) == "c"] tm.assert_numpy_array_equal(subf._codes, np.array([2, 2, 2], dtype=np.int8)) - def test_setitem(self, factor): + def test_setitem(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) # int/positional c = factor.copy() c[0] = "b" diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 16b941eab4830..4174d2adc810b 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -17,7 +17,8 @@ def test_categories_none_comparisons(self): factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) tm.assert_categorical_equal(factor, factor) - def 
test_comparisons(self, factor): + def test_comparisons(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) result = factor[factor == "a"] expected = factor[np.asarray(factor) == "a"] tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index d6f93fbbd912f..ef0315130215c 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -17,7 +17,8 @@ class TestCategoricalReprWithFactor: - def test_print(self, factor, using_infer_string): + def test_print(self, using_infer_string): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) if using_infer_string: expected = [ "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 5db0aa5cf510f..bac9548b932c1 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -10,8 +10,6 @@ ) import pandas._testing as tm -START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) - class TestDatetimeIndexOps: def test_infer_freq(self, freq_sample): @@ -26,6 +24,7 @@ def test_infer_freq(self, freq_sample): class TestBusinessDatetimeIndex: @pytest.fixture def rng(self, freq): + START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) return bdate_range(START, END, freq=freq) def test_comparison(self, rng): diff --git a/pandas/tests/tseries/offsets/conftest.py b/pandas/tests/tseries/offsets/conftest.py deleted file mode 100644 index 2fc846353dcb5..0000000000000 --- a/pandas/tests/tseries/offsets/conftest.py +++ /dev/null @@ -1,13 +0,0 @@ -import datetime - -import pytest - -from pandas._libs.tslibs import Timestamp - - -@pytest.fixture -def dt(): - """ - Fixture for common Timestamp. 
- """ - return Timestamp(datetime.datetime(2008, 1, 2)) diff --git a/pandas/tests/tseries/offsets/test_common.py b/pandas/tests/tseries/offsets/test_common.py index 5b80b8b1c4ab4..aa4e22f71ad66 100644 --- a/pandas/tests/tseries/offsets/test_common.py +++ b/pandas/tests/tseries/offsets/test_common.py @@ -250,7 +250,8 @@ def test_sub(date, offset_box, offset2): [BusinessHour, BusinessHour()], ], ) -def test_Mult1(offset_box, offset1, dt): +def test_Mult1(offset_box, offset1): + dt = Timestamp(2008, 1, 2) assert dt + 10 * offset1 == dt + offset_box(10) assert dt + 5 * offset1 == dt + offset_box(5) From 8f8b514b645db913a3b671b19b32eb7f67808df0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 26 Dec 2023 20:32:52 +0100 Subject: [PATCH 003/396] Backport PR #56615 on branch 2.2.x (CI: Fix deprecation warnings) (#56620) Backport PR #56615: CI: Fix deprecation warnings Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/tests/io/parser/common/test_chunksize.py | 5 +++-- pandas/tests/io/parser/common/test_read_errors.py | 2 +- pandas/tests/io/test_parquet.py | 4 +--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 5e47bcc1c5b0e..9660b283a491b 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -223,7 +223,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): warn = None if parser.engine == "pyarrow": warn = DeprecationWarning - depr_msg = "Passing a BlockManager to DataFrame" + depr_msg = "Passing a BlockManager to DataFrame|make_block is deprecated" with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): with monkeypatch.context() as m: m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic) @@ -254,7 +254,8 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): if parser.engine == "pyarrow": df = parser.read_csv_check_warnings( DeprecationWarning, - "Passing a BlockManager to DataFrame is deprecated", + "Passing a BlockManager to DataFrame is deprecated|" + "make_block is deprecated", buf, check_stacklevel=False, ) diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 4a4ae2b259289..db8b586d22fc0 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -171,7 +171,7 @@ def test_suppress_error_output(all_parsers): warn = None if parser.engine == "pyarrow": warn = DeprecationWarning - msg = "Passing a BlockManager to DataFrame" + msg = "Passing a BlockManager to DataFrame|make_block is deprecated" with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): result = parser.read_csv(StringIO(data), on_bad_lines="skip") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index ad7cdad363e78..e4b94177eedb2 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1000,9 +1000,7 @@ def test_filter_row_groups(self, pa): df = pd.DataFrame({"a": list(range(3))}) with tm.ensure_clean() as path: df.to_parquet(path, engine=pa) - result = read_parquet( - path, pa, filters=[("a", "==", 0)], use_legacy_dataset=False - ) + result = read_parquet(path, pa, filters=[("a", "==", 0)]) assert len(result) == 1 def test_read_parquet_manager(self, pa, using_array_manager): From 
18aa834fbac273b1ecfd50d6469cf5bf579bcac8 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 27 Dec 2023 00:50:47 +0100 Subject: [PATCH 004/396] Backport PR #56617 on branch 2.2.x (TYP: some return types from ruff) (#56624) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #56617: TYP: some return types from ruff Co-authored-by: Torsten Wörtwein --- .pre-commit-config.yaml | 2 ++ doc/source/whatsnew/v2.2.0.rst | 2 +- environment.yml | 2 +- pandas/_testing/asserters.py | 7 ++++--- pandas/_version.py | 2 +- pandas/core/computation/expr.py | 4 ++-- pandas/io/html.py | 8 ++++---- pandas/io/json/_json.py | 10 +++++----- pandas/io/parsers/arrow_parser_wrapper.py | 6 +++--- pandas/io/pytables.py | 2 +- pandas/io/sas/sas_xport.py | 2 +- pandas/io/sql.py | 8 ++++---- pandas/io/stata.py | 2 +- pandas/plotting/_matplotlib/core.py | 8 ++++---- pandas/util/_validators.py | 6 +++--- requirements-dev.txt | 2 +- 16 files changed, 38 insertions(+), 35 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2a070e9a49b97..7f3fc95ce00cc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,6 +32,8 @@ repos: # TODO: remove autofixe-only rules when they are checked by ruff name: ruff-selected-autofixes alias: ruff-selected-autofixes + files: ^pandas + exclude: ^pandas/tests args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix] - repo: https://github.com/jendrikseipp/vulture rev: 'v2.10' diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d1481639ca5a0..5ee94b74c527e 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -431,7 +431,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | Package | Minimum Version | Changed | +=================+=================+=========+ -| mypy (dev) | 1.7.1 | X | +| mypy (dev) | 1.8.0 | X | +-----------------+-----------------+---------+ | | | X | +-----------------+-----------------+---------+ diff --git a/environment.yml b/environment.yml index 74317d47e2e53..58eb69ad1f070 100644 --- a/environment.yml +++ b/environment.yml @@ -76,7 +76,7 @@ dependencies: # code checks - flake8=6.1.0 # run in subprocess over docstring examples - - mypy=1.7.1 # pre-commit uses locally installed mypy + - mypy=1.8.0 # pre-commit uses locally installed mypy - tokenize-rt # scripts/check_for_inconsistent_pandas_namespace.py - pre-commit>=3.6.0 diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index e342f76dc724b..800b03707540f 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -4,6 +4,7 @@ from typing import ( TYPE_CHECKING, Literal, + NoReturn, cast, ) @@ -143,7 +144,7 @@ def assert_almost_equal( ) -def _check_isinstance(left, right, cls): +def _check_isinstance(left, right, cls) -> None: """ Helper method for our assert_* methods that ensures that the two objects being compared have the right type before @@ -576,7 +577,7 @@ def assert_timedelta_array_equal( def raise_assert_detail( obj, message, left, right, diff=None, first_diff=None, index_values=None -): +) -> NoReturn: __tracebackhide__ = True msg = f"""{obj} are different @@ -664,7 +665,7 @@ def _get_base(obj): if left_base is right_base: raise AssertionError(f"{repr(left_base)} is {repr(right_base)}") - def _raise(left, right, err_msg): + def 
_raise(left, right, err_msg) -> NoReturn: if err_msg is None: if left.shape != right.shape: raise_assert_detail( diff --git a/pandas/_version.py b/pandas/_version.py index 5d610b5e1ea7e..f8a960630126d 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -386,7 +386,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): return pieces -def plus_or_dot(pieces): +def plus_or_dot(pieces) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 4770f403b1bdb..b5861fbaebe9c 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -695,8 +695,8 @@ def visit_Call(self, node, side=None, **kwargs): if not isinstance(key, ast.keyword): # error: "expr" has no attribute "id" raise ValueError( - "keyword error in function call " # type: ignore[attr-defined] - f"'{node.func.id}'" + "keyword error in function call " + f"'{node.func.id}'" # type: ignore[attr-defined] ) if key.arg: diff --git a/pandas/io/html.py b/pandas/io/html.py index 5d5bf079784be..26e71c9546ffd 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -269,7 +269,7 @@ def _attr_getter(self, obj, attr): # Both lxml and BeautifulSoup have the same implementation: return obj.get(attr) - def _href_getter(self, obj): + def _href_getter(self, obj) -> str | None: """ Return a href if the DOM node contains a child or None. @@ -392,7 +392,7 @@ def _parse_tables(self, document, match, attrs): """ raise AbstractMethodError(self) - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: """ Return whether an individual DOM node matches a tag @@ -629,7 +629,7 @@ def _href_getter(self, obj) -> str | None: def _text_getter(self, obj): return obj.text - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: return obj.name == tag def _parse_td(self, row): @@ -758,7 +758,7 @@ def _parse_tables(self, document, match, kwargs): raise ValueError(f"No tables found matching regex {repr(pattern)}") return tables - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: return obj.tag == tag def _build_doc(self): diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ed66e46b300f7..4c490c6b2cda2 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -255,7 +255,7 @@ def __init__( self.is_copy = None self._format_axes() - def _format_axes(self): + def _format_axes(self) -> None: raise AbstractMethodError(self) def write(self) -> str: @@ -287,7 +287,7 @@ def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: else: return self.obj - def _format_axes(self): + def _format_axes(self) -> None: if not self.obj.index.is_unique and self.orient == "index": raise ValueError(f"Series index must be unique for orient='{self.orient}'") @@ -304,7 +304,7 @@ def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: obj_to_write = self.obj return obj_to_write - def _format_axes(self): + def _format_axes(self) -> None: """ Try to format axes if they are datelike. 
""" @@ -1193,7 +1193,7 @@ def parse(self): self._try_convert_types() return self.obj - def _parse(self): + def _parse(self) -> None: raise AbstractMethodError(self) @final @@ -1217,7 +1217,7 @@ def _convert_axes(self) -> None: new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False) setattr(self.obj, axis_name, new_axis) - def _try_convert_types(self): + def _try_convert_types(self) -> None: raise AbstractMethodError(self) @final diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 66a7ccacf675b..890b22154648e 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -41,7 +41,7 @@ def __init__(self, src: ReadBuffer[bytes], **kwds) -> None: self._parse_kwds() - def _parse_kwds(self): + def _parse_kwds(self) -> None: """ Validates keywords before passing to pyarrow. """ @@ -104,7 +104,7 @@ def _get_pyarrow_options(self) -> None: ] = None # PyArrow raises an exception by default elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN: - def handle_warning(invalid_row): + def handle_warning(invalid_row) -> str: warnings.warn( f"Expected {invalid_row.expected_columns} columns, but found " f"{invalid_row.actual_columns}: {invalid_row.text}", @@ -219,7 +219,7 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: raise ValueError(e) return frame - def _validate_usecols(self, usecols): + def _validate_usecols(self, usecols) -> None: if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols): raise ValueError( "The pyarrow engine does not allow 'usecols' to be integer " diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 50611197ad7dd..1139519d2bcd3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1707,7 +1707,7 @@ def info(self) -> str: # ------------------------------------------------------------------------ # private methods - def _check_if_open(self): + def _check_if_open(self) -> None: if not self.is_open: raise ClosedFileError(f"{self._path} file is not open!") diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index e68f4789f0a06..11b2ed0ee7316 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -288,7 +288,7 @@ def close(self) -> None: def _get_row(self): return self.filepath_or_buffer.read(80).decode() - def _read_header(self): + def _read_header(self) -> None: self.filepath_or_buffer.seek(0) # read file header diff --git a/pandas/io/sql.py b/pandas/io/sql.py index b0fa6bc6e90c4..3a58daf681cfb 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1514,7 +1514,7 @@ def _create_sql_schema( keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: pass @@ -2073,7 +2073,7 @@ def _create_sql_schema( keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: table = SQLTable( table_name, self, @@ -2433,7 +2433,7 @@ def _create_sql_schema( keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: raise NotImplementedError("not implemented for adbc") @@ -2879,7 +2879,7 @@ def _create_sql_schema( keys=None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: table = SQLiteTable( table_name, self, diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 0f097c6059c7c..a4d8054ea4f8c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -687,7 +687,7 @@ def __init__( self._prepare_value_labels() - def 
_prepare_value_labels(self): + def _prepare_value_labels(self) -> None: """Encode value labels.""" self.text_len = 0 diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 479a5e19dc1c5..2979903edf360 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -662,7 +662,7 @@ def _ensure_frame(self, data) -> DataFrame: return data @final - def _compute_plot_data(self): + def _compute_plot_data(self) -> None: data = self.data # GH15079 reconstruct data if by is defined @@ -699,7 +699,7 @@ def _compute_plot_data(self): self.data = numeric_data.apply(type(self)._convert_to_ndarray) - def _make_plot(self, fig: Figure): + def _make_plot(self, fig: Figure) -> None: raise AbstractMethodError(self) @final @@ -745,7 +745,7 @@ def _post_plot_logic(self, ax: Axes, data) -> None: """Post process for each axes. Overridden in child classes""" @final - def _adorn_subplots(self, fig: Figure): + def _adorn_subplots(self, fig: Figure) -> None: """Common post process unrelated to data""" if len(self.axes) > 0: all_axes = self._get_subplots(fig) @@ -1323,7 +1323,7 @@ def __init__( c = self.data.columns[c] self.c = c - def _make_plot(self, fig: Figure): + def _make_plot(self, fig: Figure) -> None: x, y, c, data = self.x, self.y, self.c, self.data ax = self.axes[0] diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index a47f622216ef7..cb0b4d549f49e 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -26,7 +26,7 @@ BoolishNoneT = TypeVar("BoolishNoneT", bool, int, None) -def _check_arg_length(fname, args, max_fname_arg_count, compat_args): +def _check_arg_length(fname, args, max_fname_arg_count, compat_args) -> None: """ Checks whether 'args' has length of at most 'compat_args'. Raises a TypeError if that is not the case, similar to in Python when a @@ -46,7 +46,7 @@ def _check_arg_length(fname, args, max_fname_arg_count, compat_args): ) -def _check_for_default_values(fname, arg_val_dict, compat_args): +def _check_for_default_values(fname, arg_val_dict, compat_args) -> None: """ Check that the keys in `arg_val_dict` are mapped to their default values as specified in `compat_args`. @@ -125,7 +125,7 @@ def validate_args(fname, args, max_fname_arg_count, compat_args) -> None: _check_for_default_values(fname, kwargs, compat_args) -def _check_for_invalid_keys(fname, kwargs, compat_args): +def _check_for_invalid_keys(fname, kwargs, compat_args) -> None: """ Checks whether 'kwargs' contains any keys that are not in 'compat_args' and raises a TypeError if there is one. 
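Note on the return-type annotations in this patch: the hunks distinguish helpers that perform a check and return nothing on success (annotated ``-> None``, e.g. ``_check_for_invalid_keys``) from helpers that unconditionally raise (annotated ``-> NoReturn``, e.g. ``raise_assert_detail``), which lets type checkers treat calls to the latter as terminal. A minimal sketch of the pattern (hypothetical names, not code from this patch):

    from typing import NoReturn

    def check_keys(kwargs: dict, compat_args: dict) -> None:
        # Conditionally raises; returns None on the happy path.
        invalid = set(kwargs) - set(compat_args)
        if invalid:
            fail(f"got unexpected keyword arguments: {sorted(invalid)}")

    def fail(msg: str) -> NoReturn:
        # Always raises, so NoReturn is the accurate annotation;
        # type checkers then know code after a call to fail() is unreachable.
        raise TypeError(msg)
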
diff --git a/requirements-dev.txt b/requirements-dev.txt index cbfb6336b2e16..5a63e59e1db88 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -53,7 +53,7 @@ moto flask asv>=0.6.1 flake8==6.1.0 -mypy==1.7.1 +mypy==1.8.0 tokenize-rt pre-commit>=3.6.0 gitpython From d212e1f90aafec8f128582ce8485ffcd9d4f9765 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 27 Dec 2023 20:43:08 +0100 Subject: [PATCH 005/396] Backport PR #56636 on branch 2.2.x (DOC: Fixup CoW userguide) (#56639) Backport PR #56636: DOC: Fixup CoW userguide Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/user_guide/copy_on_write.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/copy_on_write.rst b/doc/source/user_guide/copy_on_write.rst index 050c3901c3420..a083297925007 100644 --- a/doc/source/user_guide/copy_on_write.rst +++ b/doc/source/user_guide/copy_on_write.rst @@ -317,7 +317,7 @@ you are modifying one object inplace. .. ipython:: python df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df2 = df.reset_index() + df2 = df.reset_index(drop=True) df2.iloc[0, 0] = 100 This creates two objects that share data and thus the setitem operation will trigger a @@ -328,7 +328,7 @@ held by the object. .. ipython:: python df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df = df.reset_index() + df = df.reset_index(drop=True) df.iloc[0, 0] = 100 No copy is necessary in this example. From 95cc20073c4fc2468dc1396dc031bb9b0ad0ce54 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 27 Dec 2023 20:55:19 +0100 Subject: [PATCH 006/396] Backport PR #56632 on branch 2.2.x (DOC: Minor fixups for 2.2.0 whatsnew) (#56640) Backport PR #56632: DOC: Minor fixups for 2.2.0 whatsnew Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 118 ++++++++++++--------------------- 1 file changed, 43 insertions(+), 75 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 5ee94b74c527e..5b955aa45219a 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -123,7 +123,7 @@ nullability handling. with pg_dbapi.connect(uri) as conn: df.to_sql("pandas_table", conn, index=False) - # for roundtripping + # for round-tripping with pg_dbapi.connect(uri) as conn: df2 = pd.read_sql("pandas_table", conn) @@ -176,7 +176,7 @@ leverage the ``dtype_backend="pyarrow"`` argument of :func:`~pandas.read_sql` .. code-block:: ipython - # for roundtripping + # for round-tripping with pg_dbapi.connect(uri) as conn: df2 = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow") @@ -306,22 +306,21 @@ Other enhancements - :meth:`~DataFrame.to_sql` with method parameter set to ``multi`` works with Oracle on the backend - :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`). - :func:`get_dummies` now returning extension dtypes ``boolean`` or ``bool[pyarrow]`` that are compatible with the input dtype (:issue:`56273`) -- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. 
(:issue:`54480`) +- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"`` (:issue:`54480`) - :func:`read_sas` returns ``datetime64`` dtypes with resolutions better matching those stored natively in SAS, and avoids returning object-dtype in cases that cannot be stored with ``datetime64[ns]`` dtype (:issue:`56127`) -- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`) +- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs` (:issue:`54264`) - :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`) +- :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) - :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`) - Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`) -- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) - Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`) -- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as "BMS" (:issue:`56243`) -- Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`) +- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as ``"BMS"`` (:issue:`56243`) +- Improved error message when constructing :class:`Period` with invalid offsets such as ``"QS"`` (:issue:`55785`) - The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`) - .. --------------------------------------------------------------------------- .. _whatsnew_220.notable_bug_fixes: @@ -386,6 +385,8 @@ index levels when joining on two indexes with different levels (:issue:`34133`). left = pd.DataFrame({"left": 1}, index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"])) right = pd.DataFrame({"right": 2}, index=pd.MultiIndex.from_tuples([(1, 1), (2, 2)], names=["B", "C"])) + left + right result = left.join(right) *Old Behavior* @@ -415,15 +416,6 @@ Backwards incompatible API changes Increased minimum versions for dependencies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Some minimum supported versions of dependencies were updated. -If installed, we now require: - -+-----------------+-----------------+----------+---------+ -| Package | Minimum Version | Required | Changed | -+=================+=================+==========+=========+ -| | | X | X | -+-----------------+-----------------+----------+---------+ - For `optional libraries `_ the general recommendation is to use the latest version. 
The following table lists the lowest version per library that is currently being tested throughout the development of pandas. Optional libraries below the lowest tested version may still work, but are not considered supported. @@ -433,8 +425,6 @@ Optional libraries below the lowest tested version may still work, but are not c +=================+=================+=========+ | mypy (dev) | 1.8.0 | X | +-----------------+-----------------+---------+ -| | | X | -+-----------------+-----------------+---------+ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. @@ -606,20 +596,20 @@ Other Deprecations - Deprecated ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`) - Deprecated accepting a type as an argument in :meth:`Index.view`, call without any arguments instead (:issue:`55709`) - Deprecated allowing non-integer ``periods`` argument in :func:`date_range`, :func:`timedelta_range`, :func:`period_range`, and :func:`interval_range` (:issue:`56036`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict`. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_excel` except ``excel_writer``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_gbq` except ``destination_table``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_hdf` except ``path_or_buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_html` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_json` except ``path_or_buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_latex` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_markdown` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except ``path``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf``. (:issue:`54229`) -- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer``. 
(:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_excel` except ``excel_writer`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_gbq` except ``destination_table`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_hdf` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_html` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_json` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_latex` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_markdown` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except ``path`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer`` (:issue:`54229`) - Deprecated allowing passing :class:`BlockManager` objects to :class:`DataFrame` or :class:`SingleBlockManager` objects to :class:`Series` (:issue:`52419`) - Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type inference on the result, explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`) - Deprecated casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`) @@ -692,31 +682,30 @@ Bug fixes Categorical ^^^^^^^^^^^ - :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`) -- Bug in :meth:`CategoricalDtype.__eq__` returning false for unordered categorical data with mixed types (:issue:`55468`) -- +- Bug in :meth:`CategoricalDtype.__eq__` returning ``False`` for unordered categorical data with mixed types (:issue:`55468`) Datetimelike ^^^^^^^^^^^^ - Bug in :class:`DatetimeIndex` construction when passing both a ``tz`` and either ``dayfirst`` or ``yearfirst`` ignoring dayfirst/yearfirst (:issue:`55813`) - Bug in :class:`DatetimeIndex` when passing an object-dtype ndarray of float objects and a ``tz`` incorrectly localizing the result (:issue:`55780`) - Bug in :func:`Series.isin` with :class:`DatetimeTZDtype` dtype and comparison values that are all ``NaT`` incorrectly returning all-``False`` even if the series contains ``NaT`` entries (:issue:`56427`) -- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. 
(:issue:`52093`) +- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame (:issue:`52093`) - Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`) - Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`) - Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing mixed-type objects with a mix of timezones or mix of timezone-awareness failing to raise ``ValueError`` (:issue:`55693`) +- Bug in :meth:`.Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in :meth:`DatetimeIndex.shift` with non-nanosecond resolution incorrectly returning with nanosecond resolution (:issue:`56117`) - Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`) - Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`) - Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`) - Bug in :meth:`Series.dt.round` with non-nanosecond resolution and ``NaT`` entries incorrectly raising ``OverflowError`` (:issue:`56158`) - Bug in :meth:`Series.fillna` with non-nanosecond resolution dtypes and higher-resolution vector values returning incorrect (internally-corrupted) results (:issue:`56410`) -- Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in :meth:`Timestamp.unit` being inferred incorrectly from an ISO8601 format string with minute or hour resolution and a timezone offset (:issue:`56208`) -- Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. ``datetime64[us]->datetim64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`) +- Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. 
``datetime64[us]->datetime64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`) - Bug in adding or subtracting a :class:`Week` offset to a ``datetime64`` :class:`Series`, :class:`Index`, or :class:`DataFrame` column with non-nanosecond resolution returning incorrect results (:issue:`55583`) - Bug in addition or subtraction of :class:`BusinessDay` offset with ``offset`` attribute to non-nanosecond :class:`Index`, :class:`Series`, or :class:`DataFrame` column giving incorrect results (:issue:`55608`) - Bug in addition or subtraction of :class:`DateOffset` objects with microsecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` columns with non-nanosecond resolution (:issue:`55595`) -- Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) +- Bug in addition or subtraction of very large :class:`.Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`) @@ -739,14 +728,12 @@ Numeric ^^^^^^^ - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) -- Conversion ^^^^^^^^^^ - Bug in :meth:`DataFrame.astype` when called with ``str`` on unpickled array - the array might change in-place (:issue:`54654`) - Bug in :meth:`DataFrame.astype` where ``errors="ignore"`` had no effect for extension types (:issue:`54654`) - Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`) -- Strings ^^^^^^^ @@ -763,13 +750,12 @@ Strings Interval ^^^^^^^^ -- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. (:issue:`55015`) +- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. 
Additionally the hour, minute and second components will now be shown (:issue:`55015`) - Bug in :meth:`IntervalIndex.factorize` and :meth:`Series.factorize` with :class:`IntervalDtype` with datetime64 or timedelta64 intervals not preserving non-nanosecond units (:issue:`56099`) - Bug in :meth:`IntervalIndex.from_arrays` when passed ``datetime64`` or ``timedelta64`` arrays with mismatched resolutions constructing an invalid ``IntervalArray`` object (:issue:`55714`) - Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`) - Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`) - Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`) -- Indexing ^^^^^^^^ @@ -781,25 +767,23 @@ Indexing Missing ^^^^^^^ - Bug in :meth:`DataFrame.update` wasn't updating in-place for tz-aware datetime64 dtypes (:issue:`56227`) -- MultiIndex ^^^^^^^^^^ - Bug in :meth:`MultiIndex.get_indexer` not raising ``ValueError`` when ``method`` provided and index is non-monotonic (:issue:`53452`) -- I/O ^^^ -- Bug in :func:`read_csv` where ``engine="python"`` did not respect ``chunksize`` arg when ``skiprows`` was specified. (:issue:`56323`) -- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable ``skiprows`` and a chunk size was specified. (:issue:`55677`) -- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) +- Bug in :func:`read_csv` where ``engine="python"`` did not respect ``chunksize`` arg when ``skiprows`` was specified (:issue:`56323`) +- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable ``skiprows`` and a chunk size was specified (:issue:`55677`) +- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raising a Python warning; this now yields a :class:`.errors.ParserWarning` (:issue:`54296`) - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``quotechar`` was ignored (:issue:`52266`) -- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) -- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) +- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a CSV with no headers (:issue:`54459`) +- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when the file contains ``NaN`` or ``Inf`` (:issue:`54564`) - Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`) -- Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) +- Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing Boolean/string value (:issue:`54994`) - Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`) -- Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when string contains annotation (:issue:`55200`) +- Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when a string cell 
contains an annotation (:issue:`55200`) - Bug in :meth:`~pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`) - Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`) @@ -808,12 +792,11 @@ Period - Bug in :class:`PeriodIndex` construction when more than one of ``data``, ``ordinal`` and ``**fields`` are passed failing to raise ``ValueError`` (:issue:`55961`) - Bug in :class:`Period` addition silently wrapping around instead of raising ``OverflowError`` (:issue:`55503`) - Bug in casting from :class:`PeriodDtype` with ``astype`` to ``datetime64`` or :class:`DatetimeTZDtype` with non-nanosecond unit incorrectly returning with nanosecond unit (:issue:`55958`) -- Plotting ^^^^^^^^ -- Bug in :meth:`DataFrame.plot.box` with ``vert=False`` and a matplotlib ``Axes`` created with ``sharey=True`` (:issue:`54941`) -- Bug in :meth:`DataFrame.plot.scatter` discaring string columns (:issue:`56142`) +- Bug in :meth:`DataFrame.plot.box` with ``vert=False`` and a Matplotlib ``Axes`` created with ``sharey=True`` (:issue:`54941`) +- Bug in :meth:`DataFrame.plot.scatter` discarding string columns (:issue:`56142`) - Bug in :meth:`Series.plot` when reusing an ``ax`` object failing to raise when a ``how`` keyword is passed (:issue:`55953`) Groupby/resample/rolling @@ -821,9 +804,9 @@ Groupby/resample/rolling - Bug in :class:`.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) - Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` would not retain :class:`.Categorical` dtype when the index was a :class:`.CategoricalIndex` that contained NA values (:issue:`54234`) - Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` when ``observed=False`` and ``f="idxmin"`` or ``f="idxmax"`` would incorrectly raise on unobserved categories (:issue:`54234`) -- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_count` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`55951`) -- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_count` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`55951`) -- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_count` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`55951`) - Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`) - Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond 
``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` where grouping by a combination of ``Decimal`` and NA values would fail when ``sort=True`` (:issue:`54847`) @@ -845,22 +828,11 @@ Reshaping - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) - Bug in :meth:`DataFrame.pivot_table` where the row margin is incorrect when the columns have numeric names (:issue:`26568`) - Bug in :meth:`DataFrame.pivot` with numeric columns and extension dtype for data (:issue:`56528`) -- Bug in :meth:`DataFrame.stack` and :meth:`Series.stack` with ``future_stack=True`` would not preserve NA values in the index (:issue:`56573`) +- Bug in :meth:`DataFrame.stack` with ``future_stack=True`` would not preserve NA values in the index (:issue:`56573`) Sparse ^^^^^^ - Bug in :meth:`SparseArray.take` when using a different fill value than the array's fill value (:issue:`55181`) -- - -ExtensionArray -^^^^^^^^^^^^^^ -- -- - -Styler -^^^^^^ -- -- Other ^^^^^ @@ -871,15 +843,11 @@ Other - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) - Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` raising a ``ValueError`` (:issue:`56478`) -- Bug in rendering ``inf`` values inside a a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) +- Bug in rendering ``inf`` values inside a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) - Bug in the error message when assigning an empty :class:`DataFrame` to a column (:issue:`55956`) - Bug when time-like strings were being cast to :class:`ArrowDtype` with ``pyarrow.time64`` type (:issue:`56463`) -.. ***DO NOT USE THIS SECTION*** - -- -- .. --------------------------------------------------------------------------- .. 
_whatsnew_220.contributors: From f8e9892dd1bdea536b3f7c25eb244d36b05b53d6 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 02:25:53 +0100 Subject: [PATCH 007/396] Backport PR #56644 on branch 2.2.x (BUG: Series.to_numpy raising for arrow floats to numpy floats) (#56648) Backport PR #56644: BUG: Series.to_numpy raising for arrow floats to numpy floats Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/arrays/arrow/array.py | 11 ++++++++++- pandas/tests/extension/test_arrow.py | 8 ++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 23b5448029dd9..de1ed9ecfdaf1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -37,6 +37,7 @@ CategoricalDtype, is_array_like, is_bool_dtype, + is_float_dtype, is_integer, is_list_like, is_numeric_dtype, @@ -1320,6 +1321,7 @@ def to_numpy( copy: bool = False, na_value: object = lib.no_default, ) -> np.ndarray: + original_na_value = na_value dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna) pa_type = self._pa_array.type if not self._hasna or isna(na_value) or pa.types.is_null(pa_type): @@ -1345,7 +1347,14 @@ def to_numpy( if dtype is not None and isna(na_value): na_value = None result = np.full(len(data), fill_value=na_value, dtype=dtype) - elif not data._hasna or (pa.types.is_floating(pa_type) and na_value is np.nan): + elif not data._hasna or ( + pa.types.is_floating(pa_type) + and ( + na_value is np.nan + or original_na_value is lib.no_default + and is_float_dtype(dtype) + ) + ): result = data._pa_array.to_numpy() if dtype is not None: result = result.astype(dtype, copy=False) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 3b03272f18203..5624acfb64764 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3153,6 +3153,14 @@ def test_string_to_time_parsing_cast(): tm.assert_series_equal(result, expected) +def test_to_numpy_float(): + # GH#56267 + ser = pd.Series([32, 40, None], dtype="float[pyarrow]") + result = ser.astype("float64") + expected = pd.Series([32, 40, np.nan], dtype="float64") + tm.assert_series_equal(result, expected) + + def test_to_numpy_timestamp_to_int(): # GH 55997 ser = pd.Series(["2020-01-01 04:30:00"], dtype="timestamp[ns][pyarrow]") From 3732cc4150fb58bdfe5c30251a8703f694c694be Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 17:16:27 +0100 Subject: [PATCH 008/396] Backport PR #56650 on branch 2.2.x (ENH: Implement dt methods for pyarrow duration types) (#56656) Backport PR #56650: ENH: Implement dt methods for pyarrow duration types Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 87 ++++++++++++++++++++++ pandas/core/indexes/accessors.py | 39 +++++++++- pandas/tests/extension/test_arrow.py | 105 +++++++++++++++++++++++++++ 4 files changed, 231 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 5b955aa45219a..f7e1cc9cbe36d 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -316,6 +316,7 @@ Other enhancements - :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and 
:meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`) - Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) +- Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`) - Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`) - Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as ``"BMS"`` (:issue:`56243`) - Improved error message when constructing :class:`Period` with invalid offsets such as ``"QS"`` (:issue:`55785`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index de1ed9ecfdaf1..32a4cadff8270 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -17,6 +17,7 @@ from pandas._libs import lib from pandas._libs.tslibs import ( + NaT, Timedelta, Timestamp, timezones, @@ -2498,6 +2499,92 @@ def _str_wrap(self, width: int, **kwargs): result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) + @property + def _dt_days(self): + return type(self)( + pa.array(self._to_timedeltaarray().days, from_pandas=True, type=pa.int32()) + ) + + @property + def _dt_hours(self): + return type(self)( + pa.array( + [ + td.components.hours if td is not NaT else None + for td in self._to_timedeltaarray() + ], + type=pa.int32(), + ) + ) + + @property + def _dt_minutes(self): + return type(self)( + pa.array( + [ + td.components.minutes if td is not NaT else None + for td in self._to_timedeltaarray() + ], + type=pa.int32(), + ) + ) + + @property + def _dt_seconds(self): + return type(self)( + pa.array( + self._to_timedeltaarray().seconds, from_pandas=True, type=pa.int32() + ) + ) + + @property + def _dt_milliseconds(self): + return type(self)( + pa.array( + [ + td.components.milliseconds if td is not NaT else None + for td in self._to_timedeltaarray() + ], + type=pa.int32(), + ) + ) + + @property + def _dt_microseconds(self): + return type(self)( + pa.array( + self._to_timedeltaarray().microseconds, + from_pandas=True, + type=pa.int32(), + ) + ) + + @property + def _dt_nanoseconds(self): + return type(self)( + pa.array( + self._to_timedeltaarray().nanoseconds, from_pandas=True, type=pa.int32() + ) + ) + + def _dt_to_pytimedelta(self): + data = self._pa_array.to_pylist() + if self._dtype.pyarrow_dtype.unit == "ns": + data = [None if ts is None else ts.to_pytimedelta() for ts in data] + return np.array(data, dtype=object) + + def _dt_total_seconds(self): + return type(self)( + pa.array(self._to_timedeltaarray().total_seconds(), from_pandas=True) + ) + + def _dt_as_unit(self, unit: str): + if pa.types.is_date(self.dtype.pyarrow_dtype): + raise NotImplementedError("as_unit not implemented for date types") + pd_array = self._maybe_convert_datelike_array() + # Don't just cast _pa_array in order to follow pandas unit conversion rules + return type(self)(pa.array(pd_array.as_unit(unit), from_pandas=True)) + @property def _dt_year(self): return type(self)(pc.year(self._pa_array)) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 929c7f4a63f8f..7e3ba4089ff60 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -148,6 +148,20 @@ def _delegate_method(self, name: str, *args, 
**kwargs): return result +@delegate_names( + delegate=ArrowExtensionArray, + accessors=TimedeltaArray._datetimelike_ops, + typ="property", + accessor_mapping=lambda x: f"_dt_{x}", + raise_on_missing=False, +) +@delegate_names( + delegate=ArrowExtensionArray, + accessors=TimedeltaArray._datetimelike_methods, + typ="method", + accessor_mapping=lambda x: f"_dt_{x}", + raise_on_missing=False, +) @delegate_names( delegate=ArrowExtensionArray, accessors=DatetimeArray._datetimelike_ops, @@ -213,6 +227,9 @@ def _delegate_method(self, name: str, *args, **kwargs): return result + def to_pytimedelta(self): + return cast(ArrowExtensionArray, self._parent.array)._dt_to_pytimedelta() + def to_pydatetime(self): # GH#20306 warnings.warn( @@ -241,6 +258,26 @@ def isocalendar(self) -> DataFrame: ) return iso_calendar_df + @property + def components(self) -> DataFrame: + from pandas import DataFrame + + components_df = DataFrame( + { + col: getattr(self._parent.array, f"_dt_{col}") + for col in [ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ] + } + ) + return components_df + @delegate_names( delegate=DatetimeArray, @@ -592,7 +629,7 @@ def __new__(cls, data: Series): # pyright: ignore[reportInconsistentConstructor index=orig.index, ) - if isinstance(data.dtype, ArrowDtype) and data.dtype.kind == "M": + if isinstance(data.dtype, ArrowDtype) and data.dtype.kind in "Mm": return ArrowTemporalProperties(data, orig) if lib.is_np_dtype(data.dtype, "M"): return DatetimeProperties(data, orig) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 5624acfb64764..20cdcb9ce9ab8 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2723,6 +2723,111 @@ def test_dt_tz_convert(unit): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("dtype", ["timestamp[ms][pyarrow]", "duration[ms][pyarrow]"]) +def test_as_unit(dtype): + # GH 52284 + ser = pd.Series([1000, None], dtype=dtype) + result = ser.dt.as_unit("ns") + expected = ser.astype(dtype.replace("ms", "ns")) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "prop, expected", + [ + ["days", 1], + ["seconds", 2], + ["microseconds", 3], + ["nanoseconds", 4], + ], +) +def test_dt_timedelta_properties(prop, expected): + # GH 52284 + ser = pd.Series( + [ + pd.Timedelta( + days=1, + seconds=2, + microseconds=3, + nanoseconds=4, + ), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = getattr(ser.dt, prop) + expected = pd.Series( + ArrowExtensionArray(pa.array([expected, None], type=pa.int32())) + ) + tm.assert_series_equal(result, expected) + + +def test_dt_timedelta_total_seconds(): + # GH 52284 + ser = pd.Series( + [ + pd.Timedelta( + days=1, + seconds=2, + microseconds=3, + nanoseconds=4, + ), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = ser.dt.total_seconds() + expected = pd.Series( + ArrowExtensionArray(pa.array([86402.000003, None], type=pa.float64())) + ) + tm.assert_series_equal(result, expected) + + +def test_dt_to_pytimedelta(): + # GH 52284 + data = [timedelta(1, 2, 3), timedelta(1, 2, 4)] + ser = pd.Series(data, dtype=ArrowDtype(pa.duration("ns"))) + + result = ser.dt.to_pytimedelta() + expected = np.array(data, dtype=object) + tm.assert_numpy_array_equal(result, expected) + assert all(type(res) is timedelta for res in result) + + expected = ser.astype("timedelta64[ns]").dt.to_pytimedelta() + tm.assert_numpy_array_equal(result, expected) + + +def 
test_dt_components(): + # GH 52284 + ser = pd.Series( + [ + pd.Timedelta( + days=1, + seconds=2, + microseconds=3, + nanoseconds=4, + ), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = ser.dt.components + expected = pd.DataFrame( + [[1, 0, 0, 2, 0, 3, 4], [None, None, None, None, None, None, None]], + columns=[ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ], + dtype="int32[pyarrow]", + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("skipna", [True, False]) def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna): # GH51624 From f99f4d62c9dcc7633ae328535a5bf9ff9365a6b7 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 17:16:37 +0100 Subject: [PATCH 009/396] Backport PR #56647 on branch 2.2.x (floordiv fix for large values) (#56655) Backport PR #56647: floordiv fix for large values Co-authored-by: rohanjain101 <38412262+rohanjain101@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 7 ++++++- pandas/tests/extension/test_arrow.py | 8 ++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index f7e1cc9cbe36d..34c9c142d3870 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -728,6 +728,7 @@ Timezones Numeric ^^^^^^^ - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) +- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) Conversion diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 32a4cadff8270..b1164301e6d79 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -115,7 +115,12 @@ def cast_for_truediv( if pa.types.is_integer(arrow_array.type) and pa.types.is_integer( pa_object.type ): - return arrow_array.cast(pa.float64()) + # https://github.com/apache/arrow/issues/35563 + # Arrow does not allow safe casting large integral values to float64. + # Intentionally not using arrow_array.cast because it could be a scalar + # value in reflected case, and safe=False only added to + # scalar cast in pyarrow 13. 
+ return pc.cast(arrow_array, pa.float64(), safe=False) return arrow_array def floordiv_compat( diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 20cdcb9ce9ab8..ed1b7b199a16f 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3238,6 +3238,14 @@ def test_arrow_floordiv(): tm.assert_series_equal(result, expected) +def test_arrow_floordiv_large_values(): + # GH 55561 + a = pd.Series([1425801600000000000], dtype="int64[pyarrow]") + expected = pd.Series([1425801600000], dtype="int64[pyarrow]") + result = a // 1_000_000 + tm.assert_series_equal(result, expected) + + def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] From 80ba45002e7f16cad9af3e6d9f6be499f445bb05 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 19:52:43 +0100 Subject: [PATCH 010/396] Backport PR #56613 on branch 2.2.x (BUG: Added raising when merging datetime columns with timedelta columns) (#56658) Backport PR #56613: BUG: Added raising when merging datetime columns with timedelta columns Co-authored-by: Huanghz2001 <83120995+Huanghz2001@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/reshape/merge.py | 5 +++++ pandas/tests/reshape/merge/test_merge.py | 20 ++++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 34c9c142d3870..d60dbefd83195 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -824,6 +824,7 @@ Reshaping - Bug in :func:`merge_asof` raising ``TypeError`` when ``by`` dtype is not ``object``, ``int64``, or ``uint64`` (:issue:`22794`) - Bug in :func:`merge_asof` raising incorrect error for string dtype (:issue:`56444`) - Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on a :class:`ArrowDtype` column (:issue:`56486`) +- Bug in :func:`merge` not raising when merging datetime columns with timedelta columns (:issue:`56455`) - Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 690e3c2700c6c..320e4e33a29fb 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1526,6 +1526,11 @@ def _maybe_coerce_merge_keys(self) -> None: ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"): # allows datetime with different resolutions continue + # datetime and timedelta not allowed + elif lk.dtype.kind == "M" and rk.dtype.kind == "m": + raise ValueError(msg) + elif lk.dtype.kind == "m" and rk.dtype.kind == "M": + raise ValueError(msg) elif is_object_dtype(lk.dtype) and is_object_dtype(rk.dtype): continue diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d7a343ae9f152..ab8d22e567d27 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2988,3 +2988,23 @@ def test_merge_empty_frames_column_order(left_empty, right_empty): elif right_empty: expected.loc[:, ["C", "D"]] = np.nan tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["left", 
"right", "inner", "outer"]) +def test_merge_datetime_and_timedelta(how): + left = DataFrame({"key": Series([1, None], dtype="datetime64[ns]")}) + right = DataFrame({"key": Series([1], dtype="timedelta64[ns]")}) + + msg = ( + f"You are trying to merge on {left['key'].dtype} and {right['key'].dtype} " + "columns for key 'key'. If you wish to proceed you should use pd.concat" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + left.merge(right, on="key", how=how) + + msg = ( + f"You are trying to merge on {right['key'].dtype} and {left['key'].dtype} " + "columns for key 'key'. If you wish to proceed you should use pd.concat" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + right.merge(left, on="key", how=how) From d84425d666d6d1b8107b7a1031a68373f523cc80 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 20:22:58 +0100 Subject: [PATCH 011/396] Backport PR #56635 on branch 2.2.x (CoW: Boolean indexer in MultiIndex raising read-only error) (#56660) Backport PR #56635: CoW: Boolean indexer in MultiIndex raising read-only error Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/indexes/multi.py | 2 ++ pandas/tests/copy_view/test_indexing.py | 21 +++++++++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d60dbefd83195..263da59e6455c 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -761,6 +761,7 @@ Interval Indexing ^^^^^^^^ +- Bug in :meth:`DataFrame.loc` mutating a boolean indexer when :class:`DataFrame` has a :class:`MultiIndex` (:issue:`56635`) - Bug in :meth:`DataFrame.loc` when setting :class:`Series` with extension dtype into NumPy dtype (:issue:`55604`) - Bug in :meth:`Index.difference` not returning a unique set of values when ``other`` is empty or ``other`` is considered non-comparable (:issue:`55113`) - Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 2a4e027e2b806..02a841a2075fd 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3488,6 +3488,8 @@ def _to_bool_indexer(indexer) -> npt.NDArray[np.bool_]: "is not the same length as the index" ) lvl_indexer = np.asarray(k) + if indexer is None: + lvl_indexer = lvl_indexer.copy() elif is_list_like(k): # a collection of labels to include from this level (these are or'd) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 6f3850ab64daa..2681c07f01990 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -1224,6 +1224,27 @@ def test_series_midx_tuples_slice(using_copy_on_write, warn_copy_on_write): tm.assert_series_equal(ser, expected) +def test_midx_read_only_bool_indexer(): + # GH#56635 + def mklbl(prefix, n): + return [f"{prefix}{i}" for i in range(n)] + + idx = pd.MultiIndex.from_product( + [mklbl("A", 4), mklbl("B", 2), mklbl("C", 4), mklbl("D", 2)] + ) + cols = pd.MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], names=["lvl0", "lvl1"] + ) + df = DataFrame(1, index=idx, columns=cols).sort_index().sort_index(axis=1) + + mask = df[("a", "foo")] == 1 + expected_mask = mask.copy() + result = df.loc[pd.IndexSlice[mask, :, ["C1", "C3"]], :] + 
expected = df.loc[pd.IndexSlice[:, :, ["C1", "C3"]], :] + tm.assert_frame_equal(result, expected) + tm.assert_series_equal(mask, expected_mask) + + def test_loc_enlarging_with_dataframe(using_copy_on_write): df = DataFrame({"a": [1, 2, 3]}) rhs = DataFrame({"b": [1, 2, 3], "c": [4, 5, 6]}) From 5043600e694cde9db4d26dd2bec8f2d7a75cdb44 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 20:23:31 +0100 Subject: [PATCH 012/396] Backport PR #56641 on branch 2.2.x (DOC: Add optional dependencies table in 2.2 whatsnew) (#56662) Backport PR #56641: DOC: Add optional dependencies table in 2.2 whatsnew Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 66 +++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 263da59e6455c..1da18cd9be8f9 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -417,15 +417,63 @@ Backwards incompatible API changes Increased minimum versions for dependencies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -For `optional libraries `_ the general recommendation is to use the latest version. -The following table lists the lowest version per library that is currently being tested throughout the development of pandas. -Optional libraries below the lowest tested version may still work, but are not considered supported. - -+-----------------+-----------------+---------+ -| Package | Minimum Version | Changed | -+=================+=================+=========+ -| mypy (dev) | 1.8.0 | X | -+-----------------+-----------------+---------+ +For `optional dependencies `_ the general recommendation is to use the latest version. +Optional dependencies below the lowest tested version may still work but are not considered supported. +The following table lists the optional dependencies that have had their minimum tested version increased. 
+ ++-----------------+---------------------+ +| Package | New Minimum Version | ++=================+=====================+ +| beautifulsoup4 | 4.11.2 | ++-----------------+---------------------+ +| blosc | 1.21.3 | ++-----------------+---------------------+ +| bottleneck | 1.3.6 | ++-----------------+---------------------+ +| fastparquet | 2022.12.0 | ++-----------------+---------------------+ +| fsspec | 2022.11.0 | ++-----------------+---------------------+ +| gcsfs | 2022.11.0 | ++-----------------+---------------------+ +| lxml | 4.9.2 | ++-----------------+---------------------+ +| matplotlib | 3.6.3 | ++-----------------+---------------------+ +| numba | 0.56.4 | ++-----------------+---------------------+ +| numexpr | 2.8.4 | ++-----------------+---------------------+ +| qtpy | 2.3.0 | ++-----------------+---------------------+ +| openpyxl | 3.1.0 | ++-----------------+---------------------+ +| psycopg2 | 2.9.6 | ++-----------------+---------------------+ +| pyreadstat | 1.2.0 | ++-----------------+---------------------+ +| pytables | 3.8.0 | ++-----------------+---------------------+ +| pyxlsb | 1.0.10 | ++-----------------+---------------------+ +| s3fs | 2022.11.0 | ++-----------------+---------------------+ +| scipy | 1.10.0 | ++-----------------+---------------------+ +| sqlalchemy | 2.0.0 | ++-----------------+---------------------+ +| tabulate | 0.9.0 | ++-----------------+---------------------+ +| xarray | 2022.12.0 | ++-----------------+---------------------+ +| xlsxwriter | 3.0.5 | ++-----------------+---------------------+ +| zstandard | 0.19.0 | ++-----------------+---------------------+ +| pyqt5 | 5.15.8 | ++-----------------+---------------------+ +| tzdata | 2022.7 | ++-----------------+---------------------+ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. 
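For reference, a minimal sketch of how an environment can be checked against the new minimums above, using only the standard library; the MINIMUMS mapping below is an illustrative subset copied from the table, not a pandas API:

    from importlib.metadata import PackageNotFoundError, version

    # Illustrative subset of the new 2.2 minimum tested versions (from the table above).
    MINIMUMS = {
        "bottleneck": "1.3.6",
        "numexpr": "2.8.4",
        "openpyxl": "3.1.0",
        "scipy": "1.10.0",
    }

    for pkg, minimum in MINIMUMS.items():
        try:
            # version() reports the installed distribution version
            print(f"{pkg}: installed {version(pkg)}, minimum tested {minimum}")
        except PackageNotFoundError:
            print(f"{pkg}: not installed (optional)")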
From 722d337e91e927ab882abe24c550c0845d093357 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 22:44:56 +0100 Subject: [PATCH 013/396] Backport PR #56370 on branch 2.2.x (BUG: rolling with datetime ArrowDtype) (#56665) Backport PR #56370: BUG: rolling with datetime ArrowDtype Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/datetimelike.py | 7 +++++- pandas/core/window/rolling.py | 23 +++++++++++-------- pandas/tests/window/test_timeseries_window.py | 16 +++++++++++++ 4 files changed, 37 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 1da18cd9be8f9..129f5cedb86c2 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -865,6 +865,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) +- Bug in :meth:`DataFrame.rolling` and :meth:`Series.rolling` where either the ``index`` or ``on`` column was :class:`ArrowDtype` with ``pyarrow.timestamp`` type (:issue:`55849`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 11a0c7bf18fcb..a0e0a1434e871 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -92,6 +92,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import ( + ArrowDtype, CategoricalDtype, DatetimeTZDtype, ExtensionDtype, @@ -2531,7 +2532,7 @@ def _validate_inferred_freq( return freq -def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype) -> str: +def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype | ArrowDtype) -> str: """ Return the unit str corresponding to the dtype's resolution. @@ -2546,4 +2547,8 @@ def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype) -> str: """ if isinstance(dtype, DatetimeTZDtype): return dtype.unit + elif isinstance(dtype, ArrowDtype): + if dtype.kind not in "mM": + raise ValueError(f"{dtype=} does not have a resolution.") + return dtype.pyarrow_dtype.unit return np.datetime_data(dtype)[0] diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index e78bd258c11ff..68cec16ec9eca 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -14,7 +14,6 @@ Any, Callable, Literal, - cast, ) import numpy as np @@ -39,6 +38,7 @@ is_numeric_dtype, needs_i8_conversion, ) +from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -104,6 +104,7 @@ NDFrameT, QuantileInterpolation, WindowingRankType, + npt, ) from pandas import ( @@ -404,11 +405,12 @@ def _insert_on_column(self, result: DataFrame, obj: DataFrame) -> None: result[name] = extra_col @property - def _index_array(self): + def _index_array(self) -> npt.NDArray[np.int64] | None: # TODO: why do we get here with e.g. MultiIndex? 
- if needs_i8_conversion(self._on.dtype): - idx = cast("PeriodIndex | DatetimeIndex | TimedeltaIndex", self._on) - return idx.asi8 + if isinstance(self._on, (PeriodIndex, DatetimeIndex, TimedeltaIndex)): + return self._on.asi8 + elif isinstance(self._on.dtype, ArrowDtype) and self._on.dtype.kind in "mM": + return self._on.to_numpy(dtype=np.int64) return None def _resolve_output(self, out: DataFrame, obj: DataFrame) -> DataFrame: @@ -439,7 +441,7 @@ def _apply_series( self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None ) -> Series: """ - Series version of _apply_blockwise + Series version of _apply_columnwise """ obj = self._create_data(self._selected_obj) @@ -455,7 +457,7 @@ def _apply_series( index = self._slice_axis_for_step(obj.index, result) return obj._constructor(result, index=index, name=obj.name) - def _apply_blockwise( + def _apply_columnwise( self, homogeneous_func: Callable[..., ArrayLike], name: str, @@ -614,7 +616,7 @@ def calc(x): return result if self.method == "single": - return self._apply_blockwise(homogeneous_func, name, numeric_only) + return self._apply_columnwise(homogeneous_func, name, numeric_only) else: return self._apply_tablewise(homogeneous_func, name, numeric_only) @@ -1232,7 +1234,9 @@ def calc(x): return result - return self._apply_blockwise(homogeneous_func, name, numeric_only)[:: self.step] + return self._apply_columnwise(homogeneous_func, name, numeric_only)[ + :: self.step + ] @doc( _shared_docs["aggregate"], @@ -1868,6 +1872,7 @@ def _validate(self): if ( self.obj.empty or isinstance(self._on, (DatetimeIndex, TimedeltaIndex, PeriodIndex)) + or (isinstance(self._on.dtype, ArrowDtype) and self._on.dtype.kind in "mM") ) and isinstance(self.window, (str, BaseOffset, timedelta)): self._validate_datetimelike_monotonic() diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index c99fc8a8eb60f..bd0fadeb3e475 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -1,9 +1,12 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, DatetimeIndex, + Index, MultiIndex, NaT, Series, @@ -697,3 +700,16 @@ def test_nat_axis_error(msg, axis): with pytest.raises(ValueError, match=f"{msg} values must not have NaT"): with tm.assert_produces_warning(FutureWarning, match=warn_msg): df.rolling("D", axis=axis).mean() + + +@td.skip_if_no("pyarrow") +def test_arrow_datetime_axis(): + # GH 55849 + expected = Series( + np.arange(5, dtype=np.float64), + index=Index( + date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]" + ), + ) + result = expected.rolling("1D").sum() + tm.assert_series_equal(result, expected) From 944c40f381996ba5c64b582200a4fd168341254c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 23:17:14 +0100 Subject: [PATCH 014/396] Backport PR #56654 on branch 2.2.x (BUG: assert_series_equal not properly respecting check-dtype) (#56668) Backport PR #56654: BUG: assert_series_equal not properly respecting check-dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/_testing/asserters.py | 10 ++++++++-- pandas/tests/extension/test_numpy.py | 10 ---------- pandas/tests/util/test_assert_frame_equal.py | 10 ++-------- pandas/tests/util/test_assert_series_equal.py | 16 +++++++++++----- 4 files changed, 21 insertions(+), 25 deletions(-) diff --git 
a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 800b03707540f..d0f38c85868d4 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -949,9 +949,15 @@ def assert_series_equal( obj=str(obj), ) else: + # convert both to NumPy if not, check_dtype would raise earlier + lv, rv = left_values, right_values + if isinstance(left_values, ExtensionArray): + lv = left_values.to_numpy() + if isinstance(right_values, ExtensionArray): + rv = right_values.to_numpy() assert_numpy_array_equal( - left_values, - right_values, + lv, + rv, check_dtype=check_dtype, obj=str(obj), index_values=left.index, diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index aaf49f53ba02b..e38144f4c615b 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -421,16 +421,6 @@ def test_index_from_listlike_with_dtype(self, data): def test_EA_types(self, engine, data, request): super().test_EA_types(engine, data, request) - @pytest.mark.xfail(reason="Expect NumpyEA, get np.ndarray") - def test_compare_array(self, data, comparison_op): - super().test_compare_array(data, comparison_op) - - def test_compare_scalar(self, data, comparison_op, request): - if data.dtype.kind == "f" or comparison_op.__name__ in ["eq", "ne"]: - mark = pytest.mark.xfail(reason="Expect NumpyEA, get np.ndarray") - request.applymarker(mark) - super().test_compare_scalar(data, comparison_op) - class Test2DCompat(base.NDArrayBacked2DTests): pass diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index a074898f6046d..79132591b15b3 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -211,10 +211,7 @@ def test_assert_frame_equal_extension_dtype_mismatch(): "\\[right\\]: int[32|64]" ) - # TODO: this shouldn't raise (or should raise a better error message) - # https://github.com/pandas-dev/pandas/issues/56131 - with pytest.raises(AssertionError, match="classes are different"): - tm.assert_frame_equal(left, right, check_dtype=False) + tm.assert_frame_equal(left, right, check_dtype=False) with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(left, right, check_dtype=True) @@ -246,7 +243,6 @@ def test_assert_frame_equal_ignore_extension_dtype_mismatch(): tm.assert_frame_equal(left, right, check_dtype=False) -@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/56131") def test_assert_frame_equal_ignore_extension_dtype_mismatch_cross_class(): # https://github.com/pandas-dev/pandas/issues/35715 left = DataFrame({"a": [1, 2, 3]}, dtype="Int64") @@ -300,9 +296,7 @@ def test_frame_equal_mixed_dtypes(frame_or_series, any_numeric_ea_dtype, indexer dtypes = (any_numeric_ea_dtype, "int64") obj1 = frame_or_series([1, 2], dtype=dtypes[indexer[0]]) obj2 = frame_or_series([1, 2], dtype=dtypes[indexer[1]]) - msg = r'(Series|DataFrame.iloc\[:, 0\] \(column name="0"\) classes) are different' - with pytest.raises(AssertionError, match=msg): - tm.assert_equal(obj1, obj2, check_exact=True, check_dtype=False) + tm.assert_equal(obj1, obj2, check_exact=True, check_dtype=False) def test_assert_frame_equal_check_like_different_indexes(): diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index f722f619bc456..c4ffc197298f0 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ 
b/pandas/tests/util/test_assert_series_equal.py @@ -290,10 +290,7 @@ def test_assert_series_equal_extension_dtype_mismatch(): \\[left\\]: Int64 \\[right\\]: int[32|64]""" - # TODO: this shouldn't raise (or should raise a better error message) - # https://github.com/pandas-dev/pandas/issues/56131 - with pytest.raises(AssertionError, match="Series classes are different"): - tm.assert_series_equal(left, right, check_dtype=False) + tm.assert_series_equal(left, right, check_dtype=False) with pytest.raises(AssertionError, match=msg): tm.assert_series_equal(left, right, check_dtype=True) @@ -372,7 +369,6 @@ def test_assert_series_equal_ignore_extension_dtype_mismatch(): tm.assert_series_equal(left, right, check_dtype=False) -@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/56131") def test_assert_series_equal_ignore_extension_dtype_mismatch_cross_class(): # https://github.com/pandas-dev/pandas/issues/35715 left = Series([1, 2, 3], dtype="Int64") @@ -456,3 +452,13 @@ def test_large_unequal_ints(dtype): right = Series([1577840521123543], dtype=dtype) with pytest.raises(AssertionError, match="Series are different"): tm.assert_series_equal(left, right) + + +@pytest.mark.parametrize("dtype", [None, object]) +@pytest.mark.parametrize("check_exact", [True, False]) +@pytest.mark.parametrize("val", [3, 3.5]) +def test_ea_and_numpy_no_dtype_check(val, check_exact, dtype): + # GH#56651 + left = Series([1, 2, val], dtype=dtype) + right = Series(pd.array([1, 2, val])) + tm.assert_series_equal(left, right, check_dtype=False, check_exact=check_exact) From 903d1520ff21a715d6ee17f2d5692f32afcc7739 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Dec 2023 23:43:47 +0100 Subject: [PATCH 015/396] Backport PR #56664 on branch 2.2.x (CI: Run jobs on 2.2.x branch) (#56669) Backport PR #56664: CI: Run jobs on 2.2.x branch Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/code-checks.yml | 4 ++-- .github/workflows/docbuild-and-upload.yml | 4 ++-- .github/workflows/package-checks.yml | 4 ++-- .github/workflows/unit-tests.yml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index b49b9a67c4743..8e29d56f47dcf 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.1.x + - 2.2.x pull_request: branches: - main - - 2.1.x + - 2.2.x env: ENV_FILE: environment.yml diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index da232404e6ff5..73acd9acc129a 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -4,13 +4,13 @@ on: push: branches: - main - - 2.1.x + - 2.2.x tags: - '*' pull_request: branches: - main - - 2.1.x + - 2.2.x env: ENV_FILE: environment.yml diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 04d8b8e006985..d59ddf272f705 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.1.x + - 2.2.x pull_request: branches: - main - - 2.1.x + - 2.2.x types: [ labeled, opened, synchronize, reopened ] permissions: diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 
6ca4d19196874..12e645dc9da81 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.1.x + - 2.2.x pull_request: branches: - main - - 2.1.x + - 2.2.x paths-ignore: - "doc/**" - "web/**" From 80ad64f05f44cc2ca3183fd6c63b8e3fa44dc486 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 29 Dec 2023 23:58:59 +0100 Subject: [PATCH 016/396] Backport PR #56666 on branch 2.2.x (STY: Use ruff instead of pygrep check for future annotation import) (#56683) Backport PR #56666: STY: Use ruff instead of pygrep check for future annotation import Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .pre-commit-config.yaml | 12 ------------ pyproject.toml | 2 ++ 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7f3fc95ce00cc..4b02ad7cf886f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -358,18 +358,6 @@ repos: files: ^pandas/ exclude: ^(pandas/_libs/|pandas/tests/|pandas/errors/__init__.py$|pandas/_version.py) types: [python] - - id: future-annotations - name: import annotations from __future__ - entry: 'from __future__ import annotations' - language: pygrep - args: [--negate] - files: ^pandas/ - types: [python] - exclude: | - (?x) - /(__init__\.py)|(api\.py)|(_version\.py)|(testing\.py)|(conftest\.py)$ - |/tests/ - |/_testing/ - id: check-test-naming name: check that test names start with 'test' entry: python -m scripts.check_test_naming diff --git a/pyproject.toml b/pyproject.toml index 5e65edf81f9c7..8724a25909543 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -259,6 +259,8 @@ select = [ "FLY", # flake8-logging-format "G", + # flake8-future-annotations + "FA", ] ignore = [ From 8c990dff7e48386c27cc9dd89b31831119ca4b62 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 30 Dec 2023 00:47:36 +0100 Subject: [PATCH 017/396] Backport PR #56682 on branch 2.2.x (CLN: NEP 50 followups) (#56684) Backport PR #56682: CLN: NEP 50 followups Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 2 +- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311-pyarrownightly.yaml | 2 +- ci/deps/actions-311-sanitizers.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-312.yaml | 2 +- ci/deps/actions-39-minimum_versions.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/actions-pypy-39.yaml | 2 +- ci/deps/circle-310-arm64.yaml | 2 +- pandas/core/dtypes/cast.py | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 12e645dc9da81..dd5d090e098b0 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -92,7 +92,7 @@ jobs: - name: "Numpy Dev" env_file: actions-311-numpydev.yaml pattern: "not slow and not network and not single_cpu" - test_args: "-W error::FutureWarning" + test_args: "-W error::DeprecationWarning -W error::FutureWarning" - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 4b62ecc79e4ef..45f114322015b 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -20,7 +20,7 @@ dependencies: # required 
dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 95c0319d6f5b8..d6bf9ec7843de 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -21,7 +21,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 5455b9b84b034..d84063ac2a9ba 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -18,7 +18,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz - pip diff --git a/ci/deps/actions-311-sanitizers.yaml b/ci/deps/actions-311-sanitizers.yaml index dcd381066b0ea..f5f04c90bffad 100644 --- a/ci/deps/actions-311-sanitizers.yaml +++ b/ci/deps/actions-311-sanitizers.yaml @@ -22,7 +22,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # pandas dependencies diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 52074ae00ea18..d14686696e669 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 4c51e9e6029e3..86aaf24b4e15c 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index fd71315d2e7ac..7067048c4434d 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -22,7 +22,7 @@ dependencies: # required dependencies - python-dateutil=2.8.2 - - numpy=1.22.4, <2 + - numpy=1.22.4 - pytz=2020.1 # optional dependencies diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index cbe8f77c15730..31ee74174cd46 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 5a5a01f7aec72..d9c8dd81b7c33 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -20,7 +20,7 @@ dependencies: - hypothesis>=6.46.1 # required - - numpy<2 + - numpy - python-dateutil - pytz - pip: diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index 8e106445cd4e0..a19ffd485262d 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7a088bf84c48e..259e83a5936d7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1332,7 +1332,7 @@ def find_result_type(left_dtype: DtypeObj, right: Any) -> DtypeObj: right = left_dtype elif ( not np.issubdtype(left_dtype, np.unsignedinteger) - and 0 < right <= 2 ** (8 * right_dtype.itemsize - 1) - 1 + and 0 < right <= np.iinfo(right_dtype).max ): # If left dtype isn't unsigned, check if it fits in the signed dtype right = 
np.dtype(f"i{right_dtype.itemsize}") From 0d0c79222e00f30614f1c94d2016039f5215dd65 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 30 Dec 2023 20:05:18 +0100 Subject: [PATCH 018/396] Backport PR #56312 on branch 2.2.x (DOC: Add whatsnew for concat regression) (#56686) Backport PR #56312: DOC: Add whatsnew for concat regression Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 129f5cedb86c2..649ad37a56b35 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -761,6 +761,7 @@ Datetimelike - Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`) - Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`) - Bug in the results of :func:`to_datetime` with an floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`) +- Fixed regression where :func:`concat` would raise an error when concatenating ``datetime64`` columns with differing resolutions (:issue:`53641`) Timedelta ^^^^^^^^^ From ee4c37723ce62e21717ea3fb584dc9a1b004de5b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 2 Jan 2024 21:39:07 +0100 Subject: [PATCH 019/396] Backport PR #56167 on branch 2.2.x ([ENH]: Expand types allowed in Series.struct.field) (#56698) Backport PR #56167: [ENH]: Expand types allowed in Series.struct.field Co-authored-by: Tom Augspurger --- doc/source/whatsnew/v2.2.0.rst | 8 ++ pandas/core/arrays/arrow/accessors.py | 125 +++++++++++++++--- .../series/accessors/test_struct_accessor.py | 48 ++++++- 3 files changed, 165 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 649ad37a56b35..15e98cbb2a4d7 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -251,6 +251,14 @@ DataFrame. (:issue:`54938`) ) series.struct.explode() +Use :meth:`Series.struct.field` to index into a (possible nested) +struct field. + + +.. ipython:: python + + series.struct.field("project") + .. _whatsnew_220.enhancements.list_accessor: Series.list accessor for PyArrow list data diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 7f88267943526..124f8fb6ad8bc 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -6,13 +6,18 @@ ABCMeta, abstractmethod, ) -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + cast, +) from pandas.compat import ( pa_version_under10p1, pa_version_under11p0, ) +from pandas.core.dtypes.common import is_list_like + if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc @@ -267,15 +272,27 @@ def dtypes(self) -> Series: names = [struct.name for struct in pa_type] return Series(types, index=Index(names)) - def field(self, name_or_index: str | int) -> Series: + def field( + self, + name_or_index: list[str] + | list[bytes] + | list[int] + | pc.Expression + | bytes + | str + | int, + ) -> Series: """ Extract a child field of a struct as a Series. 
Parameters ---------- - name_or_index : str | int + name_or_index : str | bytes | int | expression | list Name or index of the child field to extract. + For list-like inputs, this will index into a nested + struct. + Returns ------- pandas.Series @@ -285,6 +302,19 @@ def field(self, name_or_index: str | int) -> Series: -------- Series.struct.explode : Return all child fields as a DataFrame. + Notes + ----- + The name of the resulting Series will be set using the following + rules: + + - For string, bytes, or integer `name_or_index` (or a list of these, for + a nested selection), the Series name is set to the selected + field's name. + - For a :class:`pyarrow.compute.Expression`, this is set to + the string form of the expression. + - For list-like `name_or_index`, the name will be set to the + name of the final field selected. + Examples -------- >>> import pyarrow as pa @@ -314,27 +344,92 @@ def field(self, name_or_index: str | int) -> Series: 1 2 2 1 Name: version, dtype: int64[pyarrow] + + Or an expression + + >>> import pyarrow.compute as pc + >>> s.struct.field(pc.field("project")) + 0 pandas + 1 pandas + 2 numpy + Name: project, dtype: string[pyarrow] + + For nested struct types, you can pass a list of values to index + multiple levels: + + >>> version_type = pa.struct([ + ... ("major", pa.int64()), + ... ("minor", pa.int64()), + ... ]) + >>> s = pd.Series( + ... [ + ... {"version": {"major": 1, "minor": 5}, "project": "pandas"}, + ... {"version": {"major": 2, "minor": 1}, "project": "pandas"}, + ... {"version": {"major": 1, "minor": 26}, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", version_type), ("project", pa.string())] + ... )) + ... ) + >>> s.struct.field(["version", "minor"]) + 0 5 + 1 1 + 2 26 + Name: minor, dtype: int64[pyarrow] + >>> s.struct.field([0, 0]) + 0 1 + 1 2 + 2 1 + Name: major, dtype: int64[pyarrow] """ from pandas import Series + def get_name( + level_name_or_index: list[str] + | list[bytes] + | list[int] + | pc.Expression + | bytes + | str + | int, + data: pa.ChunkedArray, + ): + if isinstance(level_name_or_index, int): + name = data.type.field(level_name_or_index).name + elif isinstance(level_name_or_index, (str, bytes)): + name = level_name_or_index + elif isinstance(level_name_or_index, pc.Expression): + name = str(level_name_or_index) + elif is_list_like(level_name_or_index): + # For nested input like [2, 1, 2] + # iteratively get the struct and field name. The last + # one is used for the name of the index. + level_name_or_index = list(reversed(level_name_or_index)) + selected = data + while level_name_or_index: + # we need the cast, otherwise mypy complains about + # getting ints, bytes, or str here, which isn't possible. 
+ level_name_or_index = cast(list, level_name_or_index) + name_or_index = level_name_or_index.pop() + name = get_name(name_or_index, selected) + selected = selected.type.field(selected.type.get_field_index(name)) + name = selected.name + else: + raise ValueError( + "name_or_index must be an int, str, bytes, " + "pyarrow.compute.Expression, or list of those" + ) + return name + pa_arr = self._data.array._pa_array - if isinstance(name_or_index, int): - index = name_or_index - elif isinstance(name_or_index, str): - index = pa_arr.type.get_field_index(name_or_index) - else: - raise ValueError( - "name_or_index must be an int or str, " - f"got {type(name_or_index).__name__}" - ) + name = get_name(name_or_index, pa_arr) + field_arr = pc.struct_field(pa_arr, name_or_index) - pa_field = pa_arr.type[index] - field_arr = pc.struct_field(pa_arr, [index]) return Series( field_arr, dtype=ArrowDtype(field_arr.type), index=self._data.index, - name=pa_field.name, + name=name, ) def explode(self) -> DataFrame: diff --git a/pandas/tests/series/accessors/test_struct_accessor.py b/pandas/tests/series/accessors/test_struct_accessor.py index 1ec5b3b726d17..80aea75fda406 100644 --- a/pandas/tests/series/accessors/test_struct_accessor.py +++ b/pandas/tests/series/accessors/test_struct_accessor.py @@ -2,6 +2,11 @@ import pytest +from pandas.compat.pyarrow import ( + pa_version_under11p0, + pa_version_under13p0, +) + from pandas import ( ArrowDtype, DataFrame, @@ -11,6 +16,7 @@ import pandas._testing as tm pa = pytest.importorskip("pyarrow") +pc = pytest.importorskip("pyarrow.compute") def test_struct_accessor_dtypes(): @@ -53,6 +59,7 @@ def test_struct_accessor_dtypes(): tm.assert_series_equal(actual, expected) +@pytest.mark.skipif(pa_version_under13p0, reason="pyarrow>=13.0.0 required") def test_struct_accessor_field(): index = Index([-100, 42, 123]) ser = Series( @@ -94,10 +101,11 @@ def test_struct_accessor_field(): def test_struct_accessor_field_with_invalid_name_or_index(): ser = Series([], dtype=ArrowDtype(pa.struct([("field", pa.int64())]))) - with pytest.raises(ValueError, match="name_or_index must be an int or str"): + with pytest.raises(ValueError, match="name_or_index must be an int, str,"): ser.struct.field(1.1) +@pytest.mark.skipif(pa_version_under11p0, reason="pyarrow>=11.0.0 required") def test_struct_accessor_explode(): index = Index([-100, 42, 123]) ser = Series( @@ -148,3 +156,41 @@ def test_struct_accessor_api_for_invalid(invalid): ), ): invalid.struct + + +@pytest.mark.parametrize( + ["indices", "name"], + [ + (0, "int_col"), + ([1, 2], "str_col"), + (pc.field("int_col"), "int_col"), + ("int_col", "int_col"), + (b"string_col", b"string_col"), + ([b"string_col"], "string_col"), + ], +) +@pytest.mark.skipif(pa_version_under13p0, reason="pyarrow>=13.0.0 required") +def test_struct_accessor_field_expanded(indices, name): + arrow_type = pa.struct( + [ + ("int_col", pa.int64()), + ( + "struct_col", + pa.struct( + [ + ("int_col", pa.int64()), + ("float_col", pa.float64()), + ("str_col", pa.string()), + ] + ), + ), + (b"string_col", pa.string()), + ] + ) + + data = pa.array([], type=arrow_type) + ser = Series(data, dtype=ArrowDtype(arrow_type)) + expected = pc.struct_field(data, indices) + result = ser.struct.field(indices) + tm.assert_equal(result.array._pa_array.combine_chunks(), expected) + assert result.name == name From d43af631d6fcb6d8b6a90594b82737cfee63724b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 3 Jan 2024 11:38:27 -1000 Subject: 
[PATCH 020/396] Backport PR #56691 on branch 2.2.x (Bug pyarrow implementation of str.fullmatch matches partial string. issue #56652) (#56715)

Backport PR #56691: Bug pyarrow implementation of str.fullmatch matches partial string. issue #56652

Co-authored-by: JackCollins91 <112877841+JackCollins91@users.noreply.github.com>
---
 doc/source/whatsnew/v2.2.0.rst            | 1 +
 pandas/core/arrays/arrow/array.py         | 2 +-
 pandas/core/arrays/string_arrow.py        | 2 +-
 pandas/tests/extension/test_arrow.py      | 19 ++++++++++++-------
 pandas/tests/strings/test_find_replace.py | 9 +++++++++
 5 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 15e98cbb2a4d7..043646457f604 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -805,6 +805,7 @@ Strings
 - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`)
 - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`)
 - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`)
+- Bug in :meth:`str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string())`` allows partial matches when the regex ends in a literal ``\$`` (:issue:`56652`)
 - Bug in comparison operations for ``dtype="string[pyarrow_numpy]"`` raising if dtypes can't be compared (:issue:`56008`)

 Interval
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index b1164301e6d79..5427cee55dfb1 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2277,7 +2277,7 @@ def _str_match(
     def _str_fullmatch(
         self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
     ):
-        if not pat.endswith("$") or pat.endswith("//$"):
+        if not pat.endswith("$") or pat.endswith("\\$"):
             pat = f"{pat}$"
         return self._str_match(pat, case, flags, na)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index d5a76811a12e6..e8f614ff855c0 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -433,7 +433,7 @@ def _str_match(
     def _str_fullmatch(
         self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
     ):
-        if not pat.endswith("$") or pat.endswith("//$"):
+        if not pat.endswith("$") or pat.endswith("\\$"):
             pat = f"{pat}$"
         return self._str_match(pat, case, flags, na)

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index ed1b7b199a16f..e709e6fcfe456 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -1903,16 +1903,21 @@ def test_str_match(pat, case, na, exp):
 @pytest.mark.parametrize(
     "pat, case, na, exp",
     [
-        ["abc", False, None, [True, None]],
-        ["Abc", True, None, [False, None]],
-        ["bc", True, None, [False, None]],
-        ["ab", False, True, [True, True]],
-        ["a[a-z]{2}", False, None, [True, None]],
-        ["A[a-z]{1}", True, None, [False, None]],
+        ["abc", False, None, [True, True, False, None]],
+        ["Abc", True, None, [False, False, False, None]],
+        ["bc", True, None, [False, False, False, None]],
+        ["ab", False, None, [True, True, False, None]],
+        ["a[a-z]{2}", False, None, [True, True, False, None]],
+        ["A[a-z]{1}", True, None, [False, False, False, None]],
+        # GH Issue: #56652
+        ["abc$", False, None, [True, False, False, None]],
+
["abc\\$", False, None, [False, True, False, None]], + ["Abc$", True, None, [False, False, False, None]], + ["Abc\\$", True, None, [False, False, False, None]], ], ) def test_str_fullmatch(pat, case, na, exp): - ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) + ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) result = ser.str.match(pat, case=case, na=na) expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 3f58c6d703f8f..cd4707ac405de 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -730,6 +730,15 @@ def test_fullmatch(any_string_dtype): tm.assert_series_equal(result, expected) +def test_fullmatch_dollar_literal(any_string_dtype): + # GH 56652 + ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype) + result = ser.str.fullmatch("foo\\$") + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected = Series([False, False, np.nan, True], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + def test_fullmatch_na_kwarg(any_string_dtype): ser = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype From 341939a3d5432dced806a72151bf8c2f1f336f5a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 3 Jan 2024 23:16:55 +0100 Subject: [PATCH 021/396] Backport PR #56699 on branch 2.2.x (DOC: Corrected typo in warning on coerce) (#56719) Backport PR #56699: DOC: Corrected typo in warning on coerce Co-authored-by: aaron-robeson-8451 <65349876+aaron-robeson-8451@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/internals/blocks.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 51b4c4f297b07..d4eb5742ef928 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -432,7 +432,7 @@ In a future version, these will raise an error and you should cast to a common d In [3]: ser[0] = 'not an int64' FutureWarning: - Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. + Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'not an int64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first. In [4]: ser diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 20eff9315bc80..b7af545bd523e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -512,7 +512,7 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: if warn_on_upcast: warnings.warn( f"Setting an item of incompatible dtype is deprecated " - "and will raise in a future error of pandas. " + "and will raise an error in a future version of pandas. 
" f"Value '{other}' has dtype incompatible with {self.values.dtype}, " "please explicitly cast to a compatible dtype first.", FutureWarning, From 24ce4e10a4f9ac1bc175edf6762f982903855ef5 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 4 Jan 2024 00:31:08 +0100 Subject: [PATCH 022/396] Backport PR #56616 on branch 2.2.x (BUG: Add limit_area to EA ffill/bfill) (#56720) Backport PR #56616: BUG: Add limit_area to EA ffill/bfill Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/arrays/_mixins.py | 9 +- pandas/core/arrays/arrow/array.py | 13 +- pandas/core/arrays/base.py | 16 ++- pandas/core/arrays/interval.py | 11 +- pandas/core/arrays/masked.py | 21 +++- pandas/core/arrays/period.py | 11 +- pandas/core/arrays/sparse/array.py | 11 +- pandas/core/internals/blocks.py | 15 ++- pandas/core/missing.py | 117 +++++++++++++----- pandas/tests/extension/base/missing.py | 22 ++++ .../tests/extension/decimal/test_decimal.py | 30 +++++ pandas/tests/extension/json/array.py | 4 + pandas/tests/extension/json/test_json.py | 23 ++++ pandas/tests/frame/methods/test_fillna.py | 40 ++---- scripts/validate_unwanted_patterns.py | 1 + 16 files changed, 266 insertions(+), 80 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 043646457f604..75ba7c9f72c1b 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -321,7 +321,7 @@ Other enhancements - :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) -- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`) +- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area``; 3rd party :class:`.ExtensionArray` authors need to add this argument to the method ``_pad_or_backfill`` (:issue:`56492`) - Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) - Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 9ece12cf51a7b..0da121c36644a 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -305,7 +305,12 @@ def _fill_mask_inplace( func(self._ndarray.T, limit=limit, mask=mask.T) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: mask = self.isna() if mask.any(): @@ -315,7 +320,7 @@ def _pad_or_backfill( npvalues = self._ndarray.T if copy: npvalues = npvalues.copy() - func(npvalues, limit=limit, mask=mask.T) + func(npvalues, limit=limit, limit_area=limit_area, 
mask=mask.T)
             npvalues = npvalues.T

         if copy:
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 5427cee55dfb1..0bc01d2da330a 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -1005,13 +1005,18 @@ def dropna(self) -> Self:
         return type(self)(pc.drop_null(self._pa_array))

     def _pad_or_backfill(
-        self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True
+        self,
+        *,
+        method: FillnaOptions,
+        limit: int | None = None,
+        limit_area: Literal["inside", "outside"] | None = None,
+        copy: bool = True,
     ) -> Self:
         if not self._hasna:
             # TODO(CoW): Not necessary anymore when CoW is the default
             return self.copy()

-        if limit is None:
+        if limit is None and limit_area is None:
             method = missing.clean_fill_method(method)
             try:
                 if method == "pad":
@@ -1027,7 +1032,9 @@

         # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove
         # this method entirely.
-        return super()._pad_or_backfill(method=method, limit=limit, copy=copy)
+        return super()._pad_or_backfill(
+            method=method, limit=limit, limit_area=limit_area, copy=copy
+        )

     @doc(ExtensionArray.fillna)
     def fillna(
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 59c6d911cfaef..ea0e2e54e3339 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -70,6 +70,7 @@
     unique,
 )
 from pandas.core.array_algos.quantile import quantile_with_mask
+from pandas.core.missing import _fill_limit_area_1d
 from pandas.core.sorting import (
     nargminmax,
     nargsort,
@@ -954,7 +955,12 @@
     )

     def _pad_or_backfill(
-        self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True
+        self,
+        *,
+        method: FillnaOptions,
+        limit: int | None = None,
+        limit_area: Literal["inside", "outside"] | None = None,
+        copy: bool = True,
     ) -> Self:
         """
         Pad or backfill values, used by Series/DataFrame ffill and bfill.
@@ -1012,6 +1018,12 @@
                 DeprecationWarning,
                 stacklevel=find_stack_level(),
             )
+            if limit_area is not None:
+                raise NotImplementedError(
+                    f"{type(self).__name__} does not implement limit_area "
+                    "(added in pandas 2.2). 3rd-party ExtensionArray authors "
+                    "need to add this argument to _pad_or_backfill."
+                )
             return self.fillna(method=method, limit=limit)

         mask = self.isna()
@@ -1021,6 +1033,8 @@

         npmask = np.asarray(mask)
+        if limit_area is not None and not npmask.all():
+            _fill_limit_area_1d(npmask, limit_area)
         if meth == "pad":
             indexer = libalgos.get_fill_indexer(npmask, limit=limit)
             return self.take(indexer, allow_fill=True)
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
index a19b304529383..904c87c68e211 100644
--- a/pandas/core/arrays/interval.py
+++ b/pandas/core/arrays/interval.py
@@ -890,11 +890,18 @@ def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOr
         return obj[indexer]

     def _pad_or_backfill(  # pylint: disable=useless-parent-delegation
-        self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True
+        self,
+        *,
+        method: FillnaOptions,
+        limit: int | None = None,
+        limit_area: Literal["inside", "outside"] | None = None,
+        copy: bool = True,
     ) -> Self:
         # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove
         # this method entirely.
- return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + return super()._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) def fillna( self, value=None, method=None, limit: int | None = None, copy: bool = True diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 03c09c5b2fd18..fc092ef6eb463 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -192,7 +192,12 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: return self._simple_new(self._data[item], newmask) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: mask = self._mask @@ -204,7 +209,21 @@ def _pad_or_backfill( if copy: npvalues = npvalues.copy() new_mask = new_mask.copy() + elif limit_area is not None: + mask = mask.copy() func(npvalues, limit=limit, mask=new_mask) + + if limit_area is not None and not mask.all(): + mask = mask.T + neg_mask = ~mask + first = neg_mask.argmax() + last = len(neg_mask) - neg_mask[::-1].argmax() - 1 + if limit_area == "inside": + new_mask[:first] |= mask[:first] + new_mask[last + 1 :] |= mask[last + 1 :] + elif limit_area == "outside": + new_mask[first + 1 : last] |= mask[first + 1 : last] + if copy: return self._simple_new(npvalues.T, new_mask.T) else: diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 2930b979bfe78..28f25d38b2363 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -810,12 +810,19 @@ def searchsorted( return m8arr.searchsorted(npvalue, side=side, sorter=sorter) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: # view as dt64 so we get treated as timelike in core.missing, # similar to dtl._period_dispatch dta = self.view("M8[ns]") - result = dta._pad_or_backfill(method=method, limit=limit, copy=copy) + result = dta._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) if copy: return cast("Self", result.view(self.dtype)) else: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 5db77db2a9c66..98d84d899094b 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -716,11 +716,18 @@ def isna(self) -> Self: # type: ignore[override] return type(self)(mask, fill_value=False, dtype=dtype) def _pad_or_backfill( # pylint: disable=useless-parent-delegation - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: # TODO(3.0): We can remove this method once deprecation for fillna method # keyword is enforced. 
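# A standalone NumPy sketch (assumed example mask, not pandas internals) of the
# first/last bookkeeping the masked-array branch above relies on:
import numpy as np

mask = np.array([True, False, True, False, True])   # True marks NA slots
neg_mask = ~mask
first = neg_mask.argmax()                           # first non-NA position -> 1
last = len(neg_mask) - neg_mask[::-1].argmax() - 1  # last non-NA position -> 3
# "inside" permits fills only strictly between first and last;
# "outside" permits fills only before first or after last.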
- return super()._pad_or_backfill(method=method, limit=limit, copy=copy)
+ return super()._pad_or_backfill(
+ method=method, limit=limit, limit_area=limit_area, copy=copy
+ )
 def fillna(
 self,
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index b7af545bd523e..06fd9ebe47eae 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from functools import wraps
+import inspect
 import re
 from typing import (
 TYPE_CHECKING,
@@ -2256,11 +2257,21 @@ def pad_or_backfill(
 ) -> list[Block]:
 values = self.values
+ kwargs: dict[str, Any] = {"method": method, "limit": limit}
+ if "limit_area" in inspect.signature(values._pad_or_backfill).parameters:
+ kwargs["limit_area"] = limit_area
+ elif limit_area is not None:
+ raise NotImplementedError(
+ f"{type(values).__name__} does not implement limit_area "
+ "(added in pandas 2.2). 3rd-party ExtensionArray authors "
+ "need to add this argument to _pad_or_backfill."
+ )
+
 if values.ndim == 2 and axis == 1:
 # NDArrayBackedExtensionArray.fillna assumes axis=0
- new_values = values.T._pad_or_backfill(method=method, limit=limit).T
+ new_values = values.T._pad_or_backfill(**kwargs).T
 else:
- new_values = values._pad_or_backfill(method=method, limit=limit)
+ new_values = values._pad_or_backfill(**kwargs)
 return [self.make_block_same_class(new_values)]
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
index d275445983b6f..5dd9aaf5fbb4a 100644
--- a/pandas/core/missing.py
+++ b/pandas/core/missing.py
@@ -3,10 +3,7 @@
 """
 from __future__ import annotations
-from functools import (
- partial,
- wraps,
-)
+from functools import wraps
 from typing import (
 TYPE_CHECKING,
 Any,
@@ -823,6 +820,7 @@ def _interpolate_with_limit_area(
 values,
 method=method,
 limit=limit,
+ limit_area=limit_area,
 )
 if limit_area == "inside":
@@ -863,27 +861,6 @@ def pad_or_backfill_inplace(
 -----
 Modifies values in-place.
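# A generic sketch (standard library only; the helper name is hypothetical) of
# the signature probing blocks.py uses above to stay compatible with
# third-party arrays:
import inspect

def accepts_kwarg(func, name: str) -> bool:
    # True when ``func`` declares a parameter called ``name``
    return name in inspect.signature(func).parameters

# e.g. accepts_kwarg(arr._pad_or_backfill, "limit_area") decides whether the
# new keyword may be forwarded or a NotImplementedError must be raised.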
""" - if limit_area is not None: - np.apply_along_axis( - # error: Argument 1 to "apply_along_axis" has incompatible type - # "partial[None]"; expected - # "Callable[..., Union[_SupportsArray[dtype[]], - # Sequence[_SupportsArray[dtype[]]], - # Sequence[Sequence[_SupportsArray[dtype[]]]], - # Sequence[Sequence[Sequence[_SupportsArray[dtype[]]]]], - # Sequence[Sequence[Sequence[Sequence[_ - # SupportsArray[dtype[]]]]]]]]" - partial( # type: ignore[arg-type] - _interpolate_with_limit_area, - method=method, - limit=limit, - limit_area=limit_area, - ), - axis, - values, - ) - return - transf = (lambda x: x) if axis == 0 else (lambda x: x.T) # reshape a 1 dim if needed @@ -897,8 +874,7 @@ def pad_or_backfill_inplace( func = get_fill_func(method, ndim=2) # _pad_2d and _backfill_2d both modify tvalues inplace - func(tvalues, limit=limit) - return + func(tvalues, limit=limit, limit_area=limit_area) def _fillna_prep( @@ -909,7 +885,6 @@ def _fillna_prep( if mask is None: mask = isna(values) - mask = mask.view(np.uint8) return mask @@ -919,16 +894,23 @@ def _datetimelike_compat(func: F) -> F: """ @wraps(func) - def new_func(values, limit: int | None = None, mask=None): + def new_func( + values, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + mask=None, + ): if needs_i8_conversion(values.dtype): if mask is None: # This needs to occur before casting to int64 mask = isna(values) - result, mask = func(values.view("i8"), limit=limit, mask=mask) + result, mask = func( + values.view("i8"), limit=limit, limit_area=limit_area, mask=mask + ) return result.view(values.dtype), mask - return func(values, limit=limit, mask=mask) + return func(values, limit=limit, limit_area=limit_area, mask=mask) return cast(F, new_func) @@ -937,9 +919,12 @@ def new_func(values, limit: int | None = None, mask=None): def _pad_1d( values: np.ndarray, limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, mask: npt.NDArray[np.bool_] | None = None, ) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: mask = _fillna_prep(values, mask) + if limit_area is not None and not mask.all(): + _fill_limit_area_1d(mask, limit_area) algos.pad_inplace(values, mask, limit=limit) return values, mask @@ -948,9 +933,12 @@ def _pad_1d( def _backfill_1d( values: np.ndarray, limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, mask: npt.NDArray[np.bool_] | None = None, ) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: mask = _fillna_prep(values, mask) + if limit_area is not None and not mask.all(): + _fill_limit_area_1d(mask, limit_area) algos.backfill_inplace(values, mask, limit=limit) return values, mask @@ -959,9 +947,12 @@ def _backfill_1d( def _pad_2d( values: np.ndarray, limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, mask: npt.NDArray[np.bool_] | None = None, ): mask = _fillna_prep(values, mask) + if limit_area is not None: + _fill_limit_area_2d(mask, limit_area) if values.size: algos.pad_2d_inplace(values, mask, limit=limit) @@ -973,9 +964,14 @@ def _pad_2d( @_datetimelike_compat def _backfill_2d( - values, limit: int | None = None, mask: npt.NDArray[np.bool_] | None = None + values, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + mask: npt.NDArray[np.bool_] | None = None, ): mask = _fillna_prep(values, mask) + if limit_area is not None: + _fill_limit_area_2d(mask, limit_area) if values.size: algos.backfill_2d_inplace(values, mask, limit=limit) @@ -985,6 +981,63 @@ def _backfill_2d( 
return values, mask
+def _fill_limit_area_1d(
+ mask: npt.NDArray[np.bool_], limit_area: Literal["outside", "inside"]
+) -> None:
+ """Prepare 1d mask for ffill/bfill with limit_area.
+
+ Caller is responsible for checking that at least one value of mask is False.
+ When called, mask will no longer faithfully represent whether
+ the corresponding values are NA or not.
+
+ Parameters
+ ----------
+ mask : np.ndarray[bool, ndim=1]
+ Mask representing NA values when filling.
+ limit_area : { "outside", "inside" }
+ Whether to limit filling to outside or inside the outermost non-NA value.
+ """
+ neg_mask = ~mask
+ first = neg_mask.argmax()
+ last = len(neg_mask) - neg_mask[::-1].argmax() - 1
+ if limit_area == "inside":
+ mask[:first] = False
+ mask[last + 1 :] = False
+ elif limit_area == "outside":
+ mask[first + 1 : last] = False
+
+
+def _fill_limit_area_2d(
+ mask: npt.NDArray[np.bool_], limit_area: Literal["outside", "inside"]
+) -> None:
+ """Prepare 2d mask for ffill/bfill with limit_area.
+
+ When called, mask will no longer faithfully represent whether
+ the corresponding values are NA or not.
+
+ Parameters
+ ----------
+ mask : np.ndarray[bool, ndim=2]
+ Mask representing NA values when filling.
+ limit_area : { "outside", "inside" }
+ Whether to limit filling to outside or inside the outermost non-NA value.
+ """
+ neg_mask = ~mask.T
+ if limit_area == "outside":
+ # Identify inside
+ la_mask = (
+ np.maximum.accumulate(neg_mask, axis=0)
+ & np.maximum.accumulate(neg_mask[::-1], axis=0)[::-1]
+ )
+ else:
+ # Identify outside
+ la_mask = (
+ ~np.maximum.accumulate(neg_mask, axis=0)
+ | ~np.maximum.accumulate(neg_mask[::-1], axis=0)[::-1]
+ )
+ mask[la_mask.T] = False
+
+
 _fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d}
diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py
index ffb7a24b4b390..dbd6682c12123 100644
--- a/pandas/tests/extension/base/missing.py
+++ b/pandas/tests/extension/base/missing.py
@@ -77,6 +77,28 @@ def test_fillna_limit_pad(self, data_missing):
 expected = pd.Series(data_missing.take([1, 1, 1, 0, 1]))
 tm.assert_series_equal(result, expected)
+ @pytest.mark.parametrize(
+ "limit_area, input_ilocs, expected_ilocs",
+ [
+ ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
+ ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]),
+ ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]),
+ ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]),
+ ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]),
+ ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]),
+ ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]),
+ ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]),
+ ],
+ )
+ def test_ffill_limit_area(
+ self, data_missing, limit_area, input_ilocs, expected_ilocs
+ ):
+ # GH#56616
+ arr = data_missing.take(input_ilocs)
+ result = pd.Series(arr).ffill(limit_area=limit_area)
+ expected = pd.Series(data_missing.take(expected_ilocs))
+ tm.assert_series_equal(result, expected)
+
 @pytest.mark.filterwarnings(
 "ignore:Series.fillna with 'method' is deprecated:FutureWarning"
 )
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index b3c57ee49a724..9907e345ada63 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -156,6 +156,36 @@ def test_fillna_limit_pad(self, data_missing):
 ):
 super().test_fillna_limit_pad(data_missing)
+ @pytest.mark.parametrize(
+ "limit_area, input_ilocs, expected_ilocs",
+ [
+ ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
+ ("outside", [1, 0, 1, 0,
1]), + ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]), + ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]), + ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]), + ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]), + ], + ) + def test_ffill_limit_area( + self, data_missing, limit_area, input_ilocs, expected_ilocs + ): + # GH#56616 + msg = "ExtensionArray.fillna 'method' keyword is deprecated" + with tm.assert_produces_warning( + DeprecationWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, + ): + msg = "DecimalArray does not implement limit_area" + with pytest.raises(NotImplementedError, match=msg): + super().test_ffill_limit_area( + data_missing, limit_area, input_ilocs, expected_ilocs + ) + def test_fillna_limit_backfill(self, data_missing): msg = "Series.fillna with 'method' is deprecated" with tm.assert_produces_warning( diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index d3d9dcc4a4712..31f44f886add7 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -235,6 +235,10 @@ def _values_for_argsort(self): frozen = [tuple(x.items()) for x in self] return construct_1d_object_array_from_listlike(frozen) + def _pad_or_backfill(self, *, method, limit=None, copy=True): + # GH#56616 - test EA method without limit_area argument + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + def make_data(): # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 7686bc5abb44c..a18edac9aef93 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -149,6 +149,29 @@ def test_fillna_frame(self): """We treat dictionaries as a mapping in fillna, not a scalar.""" super().test_fillna_frame() + @pytest.mark.parametrize( + "limit_area, input_ilocs, expected_ilocs", + [ + ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]), + ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]), + ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]), + ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]), + ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]), + ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]), + ], + ) + def test_ffill_limit_area( + self, data_missing, limit_area, input_ilocs, expected_ilocs + ): + # GH#56616 + msg = "JSONArray does not implement limit_area" + with pytest.raises(NotImplementedError, match=msg): + super().test_ffill_limit_area( + data_missing, limit_area, input_ilocs, expected_ilocs + ) + @unhashable def test_value_counts(self, all_data, dropna): super().test_value_counts(all_data, dropna) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 6757669351c5c..89c50a8c21e1c 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -862,41 +862,29 @@ def test_pad_backfill_deprecated(func): @pytest.mark.parametrize( "data, expected_data, method, kwargs", ( - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan], "ffill", {"limit_area": "inside"}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, 
np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan], "ffill", {"limit_area": "inside", "limit": 1}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0], "ffill", {"limit_area": "outside"}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan], "ffill", {"limit_area": "outside", "limit": 1}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), ( [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], @@ -910,41 +898,29 @@ def test_pad_backfill_deprecated(func): "ffill", {"limit_area": "outside", "limit": 1}, ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan], "bfill", {"limit_area": "inside"}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan], "bfill", {"limit_area": "inside", "limit": 1}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], "bfill", {"limit_area": "outside"}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), - pytest.param( + ( [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], [np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], "bfill", {"limit_area": "outside", "limit": 1}, - marks=pytest.mark.xfail( - reason="GH#41813 - limit_area applied to the wrong axis" - ), ), ), ) diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 89b67ddd9f5b6..0d724779abfda 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -58,6 +58,7 @@ "_iLocIndexer", # TODO(3.0): GH#55043 - remove upon removal of ArrayManager "_get_option", + "_fill_limit_area_1d", } From 1c3c9884bdd29478ee7d93be9a4e6dc5976af075 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 4 Jan 2024 00:33:36 +0100 Subject: [PATCH 023/396] Backport PR #56721 on branch 2.2.x (DOC: Fixup read_csv docstring) (#56725) Backport PR #56721: DOC: Fixup read_csv docstring Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index a9b41b45aba2f..e26e7e7470461 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -396,7 +396,7 @@ - Callable, function with signature as described in `pyarrow documentation _` when ``engine='pyarrow'`` + #pyarrow.csv.ParseOptions.invalid_row_handler>`_ when ``engine='pyarrow'`` delim_whitespace : bool, default False Specifies whether or not whitespace (e.g. 
``' '`` or ``'\\t'``) will be From 0cd02c56ceeb6666872cf428acc3e07ffd568082 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 4 Jan 2024 00:48:10 +0100 Subject: [PATCH 024/396] Backport PR #56672 on branch 2.2.x (BUG: dictionary type astype categorical using dictionary as categories) (#56723) Backport PR #56672: BUG: dictionary type astype categorical using dictionary as categories Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/categorical.py | 46 +++++++++++++++++----------- pandas/tests/extension/test_arrow.py | 16 ++++++++++ 3 files changed, 45 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 75ba7c9f72c1b..4222de8ce324f 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -740,6 +740,7 @@ Categorical ^^^^^^^^^^^ - :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`) - Bug in :meth:`CategoricalDtype.__eq__` returning ``False`` for unordered categorical data with mixed types (:issue:`55468`) +- Bug when casting ``pa.dictionary`` to :class:`CategoricalDtype` using a ``pa.DictionaryArray`` as categories (:issue:`56672`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 065a942cae768..b87c5375856dc 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -44,7 +44,9 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import ( + ArrowDtype, CategoricalDtype, + CategoricalDtypeType, ExtensionDtype, ) from pandas.core.dtypes.generic import ( @@ -443,24 +445,32 @@ def __init__( values = arr if dtype.categories is None: - if not isinstance(values, ABCIndex): - # in particular RangeIndex xref test_index_equal_range_categories - values = sanitize_array(values, None) - try: - codes, categories = factorize(values, sort=True) - except TypeError as err: - codes, categories = factorize(values, sort=False) - if dtype.ordered: - # raise, as we don't have a sortable data structure and so - # the user should give us one by specifying categories - raise TypeError( - "'values' is not ordered, please " - "explicitly specify the categories order " - "by passing in a categories argument." - ) from err - - # we're inferring from values - dtype = CategoricalDtype(categories, dtype.ordered) + if isinstance(values.dtype, ArrowDtype) and issubclass( + values.dtype.type, CategoricalDtypeType + ): + arr = values._pa_array.combine_chunks() + categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype) + codes = arr.indices.to_numpy() + dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered) + else: + if not isinstance(values, ABCIndex): + # in particular RangeIndex xref test_index_equal_range_categories + values = sanitize_array(values, None) + try: + codes, categories = factorize(values, sort=True) + except TypeError as err: + codes, categories = factorize(values, sort=False) + if dtype.ordered: + # raise, as we don't have a sortable data structure and so + # the user should give us one by specifying categories + raise TypeError( + "'values' is not ordered, please " + "explicitly specify the categories order " + "by passing in a categories argument." 
+ ) from err + + # we're inferring from values + dtype = CategoricalDtype(categories, dtype.ordered) elif isinstance(values.dtype, CategoricalDtype): old_codes = extract_array(values)._codes diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index e709e6fcfe456..6689fb34f2ae3 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3234,6 +3234,22 @@ def test_factorize_chunked_dictionary(): tm.assert_index_equal(res_uniques, exp_uniques) +def test_dictionary_astype_categorical(): + # GH#56672 + arrs = [ + pa.array(np.array(["a", "x", "c", "a"])).dictionary_encode(), + pa.array(np.array(["a", "d", "c"])).dictionary_encode(), + ] + ser = pd.Series(ArrowExtensionArray(pa.chunked_array(arrs))) + result = ser.astype("category") + categories = pd.Index(["a", "x", "c", "d"], dtype=ArrowDtype(pa.string())) + expected = pd.Series( + ["a", "x", "c", "a", "a", "d", "c"], + dtype=pd.CategoricalDtype(categories=categories), + ) + tm.assert_series_equal(result, expected) + + def test_arrow_floordiv(): # GH 55561 a = pd.Series([-7], dtype="int64[pyarrow]") From 97eb3315c957bf01831c544d70245e5ebb9f3735 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 4 Jan 2024 09:21:26 +0100 Subject: [PATCH 025/396] Backport PR #56543 on branch 2.2.x (DOC: Update docstring for read_excel) (#56730) Backport PR #56543: DOC: Update docstring for read_excel Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/user_guide/io.rst | 19 +++++++------------ pandas/io/excel/_base.py | 32 ++++++++++---------------------- 2 files changed, 17 insertions(+), 34 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 6148086452d54..b3ad23e0d4104 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3471,20 +3471,15 @@ saving a ``DataFrame`` to Excel. Generally the semantics are similar to working with :ref:`csv` data. See the :ref:`cookbook` for some advanced strategies. -.. warning:: - - The `xlrd `__ package is now only for reading - old-style ``.xls`` files. +.. note:: - Before pandas 1.3.0, the default argument ``engine=None`` to :func:`~pandas.read_excel` - would result in using the ``xlrd`` engine in many cases, including new - Excel 2007+ (``.xlsx``) files. pandas will now default to using the - `openpyxl `__ engine. + When ``engine=None``, the following logic will be used to determine the engine: - It is strongly encouraged to install ``openpyxl`` to read Excel 2007+ - (``.xlsx``) files. - **Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.** - This is no longer supported, switch to using ``openpyxl`` instead. + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf `_ will be used. + - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. + - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used. + - Otherwise ``openpyxl`` will be used. .. _io.excel_reader: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index bce890c6f73b0..786f719337b84 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -160,36 +160,24 @@ If converters are specified, they will be applied INSTEAD of dtype conversion. If you use ``None``, it will infer the dtype of each column based on the data. 
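As a hedged illustration of the engine-selection order the updated docs
describe (the helper below is hypothetical; pandas' real dispatch also sniffs
file contents rather than relying on extensions alone):

    def pick_excel_engine(path: str) -> str:
        # documented fallback order: odf -> xlrd -> pyxlsb -> openpyxl
        ext = path.rsplit(".", 1)[-1].lower()
        if ext in ("odf", "ods", "odt"):
            return "odf"
        if ext == "xls":
            return "xlrd"
        if ext == "xlsb":
            return "pyxlsb"
        return "openpyxl"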
-engine : str, default None +engine : {{'openpyxl', 'calamine', 'odf', 'pyxlsb', 'xlrd'}}, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", "calamine". Engine compatibility : - - ``xlr`` supports old-style Excel files (.xls). - ``openpyxl`` supports newer Excel file formats. - - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - - ``pyxlsb`` supports Binary Excel files. - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb) and OpenDocument (.ods) file formats. + - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). + - ``pyxlsb`` supports Binary Excel files. + - ``xlrd`` supports old-style Excel files (.xls). - .. versionchanged:: 1.2.0 - The engine `xlrd `_ - now only supports old-style ``.xls`` files. - When ``engine=None``, the following logic will be - used to determine the engine: - - - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), - then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is an xls format, - ``xlrd`` will be used. - - Otherwise if ``path_or_buffer`` is in xlsb format, - ``pyxlsb`` will be used. - - .. versionadded:: 1.3.0 - - Otherwise ``openpyxl`` will be used. - - .. versionchanged:: 1.3.0 + When ``engine=None``, the following logic will be used to determine the engine: + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf `_ will be used. + - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. + - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used. + - Otherwise ``openpyxl`` will be used. converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one From 54dbe4527b171717535ff3bbc1e13aa748a95ed3 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 5 Jan 2024 19:56:14 +0100 Subject: [PATCH 026/396] Backport PR #56677 on branch 2.2.x (Fix integral truediv and floordiv for pyarrow types with large divisor and avoid floating points for floordiv) (#56744) Backport PR #56677: Fix integral truediv and floordiv for pyarrow types with large divisor and avoid floating points for floordiv Co-authored-by: rohanjain101 <38412262+rohanjain101@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 46 +++++++++++++----- pandas/tests/extension/test_arrow.py | 71 +++++++++++++++++++++++++++- 3 files changed, 104 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 4222de8ce324f..0b04a1d313a6d 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -786,6 +786,7 @@ Timezones Numeric ^^^^^^^ - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) +- Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`) - Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0bc01d2da330a..3858ce4cf0ea1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ 
-109,30 +109,50 @@ def cast_for_truediv( arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar - ) -> pa.ChunkedArray: + ) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]: # Ensure int / int -> float mirroring Python/Numpy behavior # as pc.divide_checked(int, int) -> int if pa.types.is_integer(arrow_array.type) and pa.types.is_integer( pa_object.type ): + # GH: 56645. # https://github.com/apache/arrow/issues/35563 - # Arrow does not allow safe casting large integral values to float64. - # Intentionally not using arrow_array.cast because it could be a scalar - # value in reflected case, and safe=False only added to - # scalar cast in pyarrow 13. - return pc.cast(arrow_array, pa.float64(), safe=False) - return arrow_array + return pc.cast(arrow_array, pa.float64(), safe=False), pc.cast( + pa_object, pa.float64(), safe=False + ) + + return arrow_array, pa_object def floordiv_compat( left: pa.ChunkedArray | pa.Array | pa.Scalar, right: pa.ChunkedArray | pa.Array | pa.Scalar, ) -> pa.ChunkedArray: - # Ensure int // int -> int mirroring Python/Numpy behavior - # as pc.floor(pc.divide_checked(int, int)) -> float - converted_left = cast_for_truediv(left, right) - result = pc.floor(pc.divide(converted_left, right)) + # TODO: Replace with pyarrow floordiv kernel. + # https://github.com/apache/arrow/issues/39386 if pa.types.is_integer(left.type) and pa.types.is_integer(right.type): + divided = pc.divide_checked(left, right) + if pa.types.is_signed_integer(divided.type): + # GH 56676 + has_remainder = pc.not_equal(pc.multiply(divided, right), left) + has_one_negative_operand = pc.less( + pc.bit_wise_xor(left, right), + pa.scalar(0, type=divided.type), + ) + result = pc.if_else( + pc.and_( + has_remainder, + has_one_negative_operand, + ), + # GH: 55561 + pc.subtract(divided, pa.scalar(1, type=divided.type)), + divided, + ) + else: + result = divided result = result.cast(left.type) + else: + divided = pc.divide(left, right) + result = pc.floor(divided) return result ARROW_ARITHMETIC_FUNCS = { @@ -142,8 +162,8 @@ def floordiv_compat( "rsub": lambda x, y: pc.subtract_checked(y, x), "mul": pc.multiply_checked, "rmul": lambda x, y: pc.multiply_checked(y, x), - "truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y), - "rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)), + "truediv": lambda x, y: pc.divide(*cast_for_truediv(x, y)), + "rtruediv": lambda x, y: pc.divide(*cast_for_truediv(y, x)), "floordiv": lambda x, y: floordiv_compat(x, y), "rfloordiv": lambda x, y: floordiv_compat(y, x), "mod": NotImplemented, diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 6689fb34f2ae3..05a112e464677 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3260,13 +3260,82 @@ def test_arrow_floordiv(): def test_arrow_floordiv_large_values(): - # GH 55561 + # GH 56645 a = pd.Series([1425801600000000000], dtype="int64[pyarrow]") expected = pd.Series([1425801600000], dtype="int64[pyarrow]") result = a // 1_000_000 tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_floordiv_large_integral_result(dtype): + # GH 56676 + a = pd.Series([18014398509481983], dtype=dtype) + result = a // 1 + tm.assert_series_equal(result, a) + + +@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES) +def test_arrow_floordiv_larger_divisor(pa_type): + # GH 56676 + dtype = ArrowDtype(pa_type) + a = 
pd.Series([-23], dtype=dtype) + result = a // 24 + expected = pd.Series([-1], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES) +def test_arrow_floordiv_integral_invalid(pa_type): + # GH 56676 + min_value = np.iinfo(pa_type.to_pandas_dtype()).min + a = pd.Series([min_value], dtype=ArrowDtype(pa_type)) + with pytest.raises(pa.lib.ArrowInvalid, match="overflow|not in range"): + a // -1 + with pytest.raises(pa.lib.ArrowInvalid, match="divide by zero"): + a // 0 + + +@pytest.mark.parametrize("dtype", tm.FLOAT_PYARROW_DTYPES_STR_REPR) +def test_arrow_floordiv_floating_0_divisor(dtype): + # GH 56676 + a = pd.Series([2], dtype=dtype) + result = a // 0 + expected = pd.Series([float("inf")], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES) +def test_arrow_integral_floordiv_large_values(pa_type): + # GH 56676 + max_value = np.iinfo(pa_type.to_pandas_dtype()).max + dtype = ArrowDtype(pa_type) + a = pd.Series([max_value], dtype=dtype) + b = pd.Series([1], dtype=dtype) + result = a // b + tm.assert_series_equal(result, a) + + +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_true_division_large_divisor(dtype): + # GH 56706 + a = pd.Series([0], dtype=dtype) + b = pd.Series([18014398509481983], dtype=dtype) + expected = pd.Series([0], dtype="float64[pyarrow]") + result = a / b + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_floor_division_large_divisor(dtype): + # GH 56706 + a = pd.Series([0], dtype=dtype) + b = pd.Series([18014398509481983], dtype=dtype) + expected = pd.Series([0], dtype=dtype) + result = a // b + tm.assert_series_equal(result, expected) + + def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] From 57079b6b91673b13109de5196b0679a191d714dc Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 7 Jan 2024 22:02:02 +0100 Subject: [PATCH 027/396] Backport PR #56761 on branch 2.2.x (BUG: fix subclass metadata preservation in groupby column selection) (#56770) Backport PR #56761: BUG: fix subclass metadata preservation in groupby column selection Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/frame.py | 4 +++- pandas/tests/groupby/test_groupby_subclass.py | 8 ++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 0b04a1d313a6d..2b436bc5d1855 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -873,6 +873,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`) - Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond ``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` where grouping by a combination of ``Decimal`` and NA values would fail when ``sort=True`` (:issue:`54847`) +- Bug in :meth:`DataFrame.groupby` for DataFrame subclasses when selecting a subset of columns to apply the function to (:issue:`56761`) - Bug in :meth:`DataFrame.resample` not respecting 
``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3e2e589440bd9..15ccbd602c9c8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4016,7 +4016,9 @@ def _getitem_nocopy(self, key: list): copy=False, only_slice=True, ) - return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + result = result.__finalize__(self) + return result def __getitem__(self, key): check_dict_or_set_indexers(key) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index bf809bd5db437..17ef6ee913463 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -69,6 +69,7 @@ def test_groupby_preserves_metadata(): def func(group): assert isinstance(group, tm.SubclassedDataFrame) assert hasattr(group, "testattr") + assert group.testattr == "hello" return group.testattr msg = "DataFrameGroupBy.apply operated on the grouping columns" @@ -79,6 +80,13 @@ def func(group): expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c")) tm.assert_series_equal(result, expected) + result = custom_df.groupby("c").apply(func, include_groups=False) + tm.assert_series_equal(result, expected) + + # https://github.com/pandas-dev/pandas/pull/56761 + result = custom_df.groupby("c")[["a", "b"]].apply(func) + tm.assert_series_equal(result, expected) + def func2(group): assert isinstance(group, tm.SubclassedSeries) assert hasattr(group, "testattr") From 41f22b302b18310d58b894b3b04412ca413cbbfe Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 8 Jan 2024 20:25:45 +0100 Subject: [PATCH 028/396] Backport PR #56769 on branch 2.2.x (BUG: replace matching Floats with bools for ea dtypes) (#56780) Backport PR #56769: BUG: replace matching Floats with bools for ea dtypes Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/missing.py | 42 ++++++++++++++------- pandas/tests/series/methods/test_replace.py | 12 ++++++ 3 files changed, 41 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 2b436bc5d1855..b138e91b41661 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -789,6 +789,7 @@ Numeric - Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`) - Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` matching float ``0.0`` with ``False`` and vice versa (:issue:`55398`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 5dd9aaf5fbb4a..ff45662d0bdc8 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -31,6 +31,7 @@ from pandas.core.dtypes.cast 
import infer_dtype_from from pandas.core.dtypes.common import ( is_array_like, + is_bool_dtype, is_numeric_dtype, is_numeric_v_string_like, is_object_dtype, @@ -100,21 +101,34 @@ def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]: # GH 21977 mask = np.zeros(arr.shape, dtype=bool) - for x in nonna: - if is_numeric_v_string_like(arr, x): - # GH#29553 prevent numpy deprecation warnings - pass - else: - if potential_na: - new_mask = np.zeros(arr.shape, dtype=np.bool_) - new_mask[arr_mask] = arr[arr_mask] == x + if ( + is_numeric_dtype(arr.dtype) + and not is_bool_dtype(arr.dtype) + and is_bool_dtype(nonna.dtype) + ): + pass + elif ( + is_bool_dtype(arr.dtype) + and is_numeric_dtype(nonna.dtype) + and not is_bool_dtype(nonna.dtype) + ): + pass + else: + for x in nonna: + if is_numeric_v_string_like(arr, x): + # GH#29553 prevent numpy deprecation warnings + pass else: - new_mask = arr == x - - if not isinstance(new_mask, np.ndarray): - # usually BooleanArray - new_mask = new_mask.to_numpy(dtype=bool, na_value=False) - mask |= new_mask + if potential_na: + new_mask = np.zeros(arr.shape, dtype=np.bool_) + new_mask[arr_mask] = arr[arr_mask] == x + else: + new_mask = arr == x + + if not isinstance(new_mask, np.ndarray): + # usually BooleanArray + new_mask = new_mask.to_numpy(dtype=bool, na_value=False) + mask |= new_mask if na_mask.any(): mask |= isna(arr) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 4330153c186ca..b0f4e233ba5eb 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -799,3 +799,15 @@ def test_replace_numeric_column_with_na(self, val): ser.replace(to_replace=1, value=pd.NA, inplace=True) tm.assert_series_equal(ser, expected) + + def test_replace_ea_float_with_bool(self): + # GH#55398 + ser = pd.Series([0.0], dtype="Float64") + expected = ser.copy() + result = ser.replace(False, 1.0) + tm.assert_series_equal(result, expected) + + ser = pd.Series([False], dtype="boolean") + expected = ser.copy() + result = ser.replace(0.0, True) + tm.assert_series_equal(result, expected) From 6dbeeb4009bbfac5ea1ae2111346f5e9f05b81f4 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 8 Jan 2024 23:24:22 +0100 Subject: [PATCH 029/396] Backport PR #56767 on branch 2.2.x (BUG: Series.round raising for nullable bool dtype) (#56782) Backport PR #56767: BUG: Series.round raising for nullable bool dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/masked.py | 2 ++ pandas/core/series.py | 6 ++---- pandas/tests/series/methods/test_round.py | 9 +++++++++ 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index b138e91b41661..93b63f99ea399 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -790,6 +790,7 @@ Numeric - Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` matching float ``0.0`` with ``False`` and vice versa (:issue:`55398`) +- Bug in :meth:`Series.round` raising for nullable boolean dtype (:issue:`55936`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/arrays/masked.py 
b/pandas/core/arrays/masked.py index fc092ef6eb463..545d45e450f3f 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -403,6 +403,8 @@ def round(self, decimals: int = 0, *args, **kwargs): DataFrame.round : Round values of a DataFrame. Series.round : Round values of a Series. """ + if self.dtype.kind == "b": + return self nv.validate_round(args, kwargs) values = np.round(self._data, decimals=decimals, **kwargs) diff --git a/pandas/core/series.py b/pandas/core/series.py index e3b401cd3c88b..a6762dd1b48a2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2788,13 +2788,11 @@ def round(self, decimals: int = 0, *args, **kwargs) -> Series: dtype: float64 """ nv.validate_round(args, kwargs) - result = self._values.round(decimals) - result = self._constructor(result, index=self.index, copy=False).__finalize__( + new_mgr = self._mgr.round(decimals=decimals, using_cow=using_copy_on_write()) + return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__( self, method="round" ) - return result - @overload def quantile( self, q: float = ..., interpolation: QuantileInterpolation = ... diff --git a/pandas/tests/series/methods/test_round.py b/pandas/tests/series/methods/test_round.py index 7f60c94f10e4f..c330b7a7dfbbb 100644 --- a/pandas/tests/series/methods/test_round.py +++ b/pandas/tests/series/methods/test_round.py @@ -63,3 +63,12 @@ def test_round_nat(self, method, freq, unit): round_method = getattr(ser.dt, method) result = round_method(freq) tm.assert_series_equal(result, expected) + + def test_round_ea_boolean(self): + # GH#55936 + ser = Series([True, False], dtype="boolean") + expected = ser.copy() + result = ser.round(2) + tm.assert_series_equal(result, expected) + result.iloc[0] = False + tm.assert_series_equal(ser, expected) From 58c9ef79cb976e14850929bde5d6f9b416122dd0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 8 Jan 2024 23:24:40 +0100 Subject: [PATCH 030/396] Backport PR #56771 on branch 2.2.x (BUG: to_stata not handling ea dtypes correctly) (#56783) Backport PR #56771: BUG: to_stata not handling ea dtypes correctly Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/io/stata.py | 23 ++++++++++++--------- pandas/tests/io/test_stata.py | 37 ++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 93b63f99ea399..c85fd75a3685f 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -848,6 +848,7 @@ I/O - Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`) - Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing Boolean/string value (:issue:`54994`) - Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`) +- Bug in :meth:`DataFrame.to_stata` raising for extension dtypes (:issue:`54671`) - Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when a string cell contains an annotation (:issue:`55200`) - Bug in :meth:`~pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`) - Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`) diff 
--git a/pandas/io/stata.py b/pandas/io/stata.py index a4d8054ea4f8c..4abf9af185a01 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -47,9 +47,11 @@ ) from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( ensure_object, is_numeric_dtype, + is_string_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype @@ -62,8 +64,6 @@ to_datetime, to_timedelta, ) -from pandas.core.arrays.boolean import BooleanDtype -from pandas.core.arrays.integer import IntegerDtype from pandas.core.frame import DataFrame from pandas.core.indexes.base import Index from pandas.core.indexes.range import RangeIndex @@ -591,17 +591,22 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: for col in data: # Cast from unsupported types to supported types - is_nullable_int = isinstance(data[col].dtype, (IntegerDtype, BooleanDtype)) + is_nullable_int = ( + isinstance(data[col].dtype, ExtensionDtype) + and data[col].dtype.kind in "iub" + ) # We need to find orig_missing before altering data below orig_missing = data[col].isna() if is_nullable_int: - missing_loc = data[col].isna() - if missing_loc.any(): - # Replace with always safe value - fv = 0 if isinstance(data[col].dtype, IntegerDtype) else False - data.loc[missing_loc, col] = fv + fv = 0 if data[col].dtype.kind in "iu" else False # Replace with NumPy-compatible column - data[col] = data[col].astype(data[col].dtype.numpy_dtype) + data[col] = data[col].fillna(fv).astype(data[col].dtype.numpy_dtype) + elif isinstance(data[col].dtype, ExtensionDtype): + if getattr(data[col].dtype, "numpy_dtype", None) is not None: + data[col] = data[col].astype(data[col].dtype.numpy_dtype) + elif is_string_dtype(data[col].dtype): + data[col] = data[col].astype("object") + dtype = data[col].dtype empty_df = data.shape[0] == 0 for c_data in conversion_data: diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 3e4e1a107da9d..6bd74faa8a3db 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -11,6 +11,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import CategoricalDtype import pandas._testing as tm @@ -1921,6 +1923,41 @@ def test_writer_118_exceptions(self): with pytest.raises(ValueError, match="You must use version 119"): StataWriterUTF8(path, df, version=118) + @pytest.mark.parametrize( + "dtype_backend", + ["numpy_nullable", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))], + ) + def test_read_write_ea_dtypes(self, dtype_backend): + df = DataFrame( + { + "a": [1, 2, None], + "b": ["a", "b", "c"], + "c": [True, False, None], + "d": [1.5, 2.5, 3.5], + "e": pd.date_range("2020-12-31", periods=3, freq="D"), + }, + index=pd.Index([0, 1, 2], name="index"), + ) + df = df.convert_dtypes(dtype_backend=dtype_backend) + df.to_stata("test_stata.dta", version=118) + + with tm.ensure_clean() as path: + df.to_stata(path) + written_and_read_again = self.read_dta(path) + + expected = DataFrame( + { + "a": [1, 2, np.nan], + "b": ["a", "b", "c"], + "c": [1.0, 0, np.nan], + "d": [1.5, 2.5, 3.5], + "e": pd.date_range("2020-12-31", periods=3, freq="D"), + }, + index=pd.Index([0, 1, 2], name="index", dtype=np.int32), + ) + + tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) + @pytest.mark.parametrize("version", [105, 108, 111, 113, 114]) def test_backward_compat(version, datapath): From bf28e02bd592eba25a8eb8ca296316ff2671f469 Mon Sep 17 00:00:00 
2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 8 Jan 2024 23:26:47 +0100 Subject: [PATCH 031/396] Backport PR #56766 on branch 2.2.x (BUG: IntervalIndex.from_tuples raising with masked subtype) (#56785) Backport PR #56766: BUG: IntervalIndex.from_tuples raising with masked subtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/interval.py | 18 +++++++++++++----- .../indexes/interval/test_constructors.py | 16 ++++++++++++++++ 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index c85fd75a3685f..dbb11d3d0788d 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -817,6 +817,7 @@ Interval - Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown (:issue:`55015`) - Bug in :meth:`IntervalIndex.factorize` and :meth:`Series.factorize` with :class:`IntervalDtype` with datetime64 or timedelta64 intervals not preserving non-nanosecond units (:issue:`56099`) - Bug in :meth:`IntervalIndex.from_arrays` when passed ``datetime64`` or ``timedelta64`` arrays with mismatched resolutions constructing an invalid ``IntervalArray`` object (:issue:`55714`) +- Bug in :meth:`IntervalIndex.from_tuples` raising if subtype is a nullable extension dtype (:issue:`56765`) - Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`) - Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`) - Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 904c87c68e211..e69f996441703 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -79,6 +79,7 @@ unique, value_counts_internal as value_counts, ) +from pandas.core.arrays import ArrowExtensionArray from pandas.core.arrays.base import ( ExtensionArray, _extension_array_shared_docs, @@ -370,11 +371,18 @@ def _ensure_simple_new_inputs( right = ensure_wrapped_if_datetimelike(right) right = extract_array(right, extract_numpy=True) - lbase = getattr(left, "_ndarray", left).base - rbase = getattr(right, "_ndarray", right).base - if lbase is not None and lbase is rbase: - # If these share data, then setitem could corrupt our IA - right = right.copy() + if isinstance(left, ArrowExtensionArray) or isinstance( + right, ArrowExtensionArray + ): + pass + else: + lbase = getattr(left, "_ndarray", left) + lbase = getattr(lbase, "_data", lbase).base + rbase = getattr(right, "_ndarray", right) + rbase = getattr(rbase, "_data", rbase).base + if lbase is not None and lbase is rbase: + # If these share data, then setitem could corrupt our IA + right = right.copy() dtype = IntervalDtype(left.dtype, closed=closed) diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 778c07b46e57c..e47a014f18045 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from 
pandas.core.dtypes.common import is_unsigned_integer_dtype from pandas.core.dtypes.dtypes import IntervalDtype @@ -517,3 +519,17 @@ def test_dtype_closed_mismatch(): with pytest.raises(ValueError, match=msg): IntervalArray([], dtype=dtype, closed="neither") + + +@pytest.mark.parametrize( + "dtype", + ["Float64", pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow"))], +) +def test_ea_dtype(dtype): + # GH#56765 + bins = [(0.0, 0.4), (0.4, 0.6)] + interval_dtype = IntervalDtype(subtype=dtype, closed="left") + result = IntervalIndex.from_tuples(bins, closed="left", dtype=interval_dtype) + assert result.dtype == interval_dtype + expected = IntervalIndex.from_tuples(bins, closed="left").astype(interval_dtype) + tm.assert_index_equal(result, expected) From b89079bf0da316aff5433f3ee2b52cb9f89e34d0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 9 Jan 2024 00:27:06 +0100 Subject: [PATCH 032/396] Backport PR #56724 on branch 2.2.x (TST: Don't ignore tolerance for integer series) (#56786) Backport PR #56724: TST: Don't ignore tolerance for integer series Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/_testing/asserters.py | 98 ++++++++++++++----- pandas/tests/util/test_assert_series_equal.py | 12 +++ 2 files changed, 84 insertions(+), 26 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index d0f38c85868d4..3de982498e996 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -10,6 +10,7 @@ import numpy as np +from pandas._libs import lib from pandas._libs.missing import is_matching_na from pandas._libs.sparse import SparseIndex import pandas._libs.testing as _testing @@ -698,9 +699,9 @@ def assert_extension_array_equal( right, check_dtype: bool | Literal["equiv"] = True, index_values=None, - check_exact: bool = False, - rtol: float = 1.0e-5, - atol: float = 1.0e-8, + check_exact: bool | lib.NoDefault = lib.no_default, + rtol: float | lib.NoDefault = lib.no_default, + atol: float | lib.NoDefault = lib.no_default, obj: str = "ExtensionArray", ) -> None: """ @@ -715,7 +716,12 @@ def assert_extension_array_equal( index_values : Index | numpy.ndarray, default None Optional index (shared by both left and right), used in output. check_exact : bool, default False - Whether to compare number exactly. Only takes effect for float dtypes. + Whether to compare number exactly. + + .. versionchanged:: 2.2.0 + + Defaults to True for integer dtypes if none of + ``check_exact``, ``rtol`` and ``atol`` are specified. rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. 
atol : float, default 1e-8 @@ -739,6 +745,23 @@ def assert_extension_array_equal( >>> b, c = a.array, a.array >>> tm.assert_extension_array_equal(b, c) """ + if ( + check_exact is lib.no_default + and rtol is lib.no_default + and atol is lib.no_default + ): + check_exact = ( + is_numeric_dtype(left.dtype) + and not is_float_dtype(left.dtype) + or is_numeric_dtype(right.dtype) + and not is_float_dtype(right.dtype) + ) + elif check_exact is lib.no_default: + check_exact = False + + rtol = rtol if rtol is not lib.no_default else 1.0e-5 + atol = atol if atol is not lib.no_default else 1.0e-8 + assert isinstance(left, ExtensionArray), "left is not an ExtensionArray" assert isinstance(right, ExtensionArray), "right is not an ExtensionArray" if check_dtype: @@ -784,10 +807,7 @@ def assert_extension_array_equal( left_valid = left[~left_na].to_numpy(dtype=object) right_valid = right[~right_na].to_numpy(dtype=object) - if check_exact or ( - (is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)) - or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) - ): + if check_exact: assert_numpy_array_equal( left_valid, right_valid, obj=obj, index_values=index_values ) @@ -811,14 +831,14 @@ def assert_series_equal( check_index_type: bool | Literal["equiv"] = "equiv", check_series_type: bool = True, check_names: bool = True, - check_exact: bool = False, + check_exact: bool | lib.NoDefault = lib.no_default, check_datetimelike_compat: bool = False, check_categorical: bool = True, check_category_order: bool = True, check_freq: bool = True, check_flags: bool = True, - rtol: float = 1.0e-5, - atol: float = 1.0e-8, + rtol: float | lib.NoDefault = lib.no_default, + atol: float | lib.NoDefault = lib.no_default, obj: str = "Series", *, check_index: bool = True, @@ -841,7 +861,12 @@ def assert_series_equal( check_names : bool, default True Whether to check the Series and Index names attribute. check_exact : bool, default False - Whether to compare number exactly. Only takes effect for float dtypes. + Whether to compare number exactly. + + .. versionchanged:: 2.2.0 + + Defaults to True for integer dtypes if none of + ``check_exact``, ``rtol`` and ``atol`` are specified. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. 
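To make the ``versionchanged`` note above concrete, here is a minimal sketch (assuming pandas 2.2; illustrative, not part of the patch): integer Series are now compared exactly by default, while an explicitly passed ``rtol`` is honored instead of being silently ignored (GH#56646).

import pandas as pd
import pandas._testing as tm

left = pd.Series([100, 200])
right = pd.Series([100, 201])
# Passes: the explicit relative tolerance is applied to the integer values.
tm.assert_series_equal(left, right, rtol=0.01)
# Would raise: without any tolerance argument, integer dtypes compare exactly.
# tm.assert_series_equal(left, right)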
check_categorical : bool, default True @@ -877,6 +902,22 @@ def assert_series_equal( >>> tm.assert_series_equal(a, b) """ __tracebackhide__ = True + if ( + check_exact is lib.no_default + and rtol is lib.no_default + and atol is lib.no_default + ): + check_exact = ( + is_numeric_dtype(left.dtype) + and not is_float_dtype(left.dtype) + or is_numeric_dtype(right.dtype) + and not is_float_dtype(right.dtype) + ) + elif check_exact is lib.no_default: + check_exact = False + + rtol = rtol if rtol is not lib.no_default else 1.0e-5 + atol = atol if atol is not lib.no_default else 1.0e-8 if not check_index and check_like: raise ValueError("check_like must be False if check_index is False") @@ -931,10 +972,7 @@ def assert_series_equal( pass else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") - if check_exact or ( - (is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)) - or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) - ): + if check_exact: left_values = left._values right_values = right._values # Only check exact if dtype is numeric @@ -1061,14 +1099,14 @@ def assert_frame_equal( check_frame_type: bool = True, check_names: bool = True, by_blocks: bool = False, - check_exact: bool = False, + check_exact: bool | lib.NoDefault = lib.no_default, check_datetimelike_compat: bool = False, check_categorical: bool = True, check_like: bool = False, check_freq: bool = True, check_flags: bool = True, - rtol: float = 1.0e-5, - atol: float = 1.0e-8, + rtol: float | lib.NoDefault = lib.no_default, + atol: float | lib.NoDefault = lib.no_default, obj: str = "DataFrame", ) -> None: """ @@ -1103,7 +1141,12 @@ def assert_frame_equal( Specify how to compare internal data. If False, compare by columns. If True, compare by blocks. check_exact : bool, default False - Whether to compare number exactly. Only takes effect for float dtypes. + Whether to compare number exactly. + + .. versionchanged:: 2.2.0 + + Defaults to True for integer dtypes if none of + ``check_exact``, ``rtol`` and ``atol`` are specified. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. 
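The same default applies to frames; a hedged sketch (not part of the patch) showing that an explicit tolerance, resolved from the new no-default sentinels, is also forwarded to the index and column comparisons:

import pandas as pd
import pandas._testing as tm

left = pd.DataFrame({"a": [100, 200]})
right = pd.DataFrame({"a": [100, 201]})
# Passes only because a tolerance is given; integer dtypes otherwise
# compare exactly in pandas 2.2.
tm.assert_frame_equal(left, right, rtol=0.01)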
check_categorical : bool, default True @@ -1158,6 +1201,9 @@ def assert_frame_equal( >>> assert_frame_equal(df1, df2, check_dtype=False) """ __tracebackhide__ = True + _rtol = rtol if rtol is not lib.no_default else 1.0e-5 + _atol = atol if atol is not lib.no_default else 1.0e-8 + _check_exact = check_exact if check_exact is not lib.no_default else False # instance validation _check_isinstance(left, right, DataFrame) @@ -1181,11 +1227,11 @@ def assert_frame_equal( right.index, exact=check_index_type, check_names=check_names, - check_exact=check_exact, + check_exact=_check_exact, check_categorical=check_categorical, check_order=not check_like, - rtol=rtol, - atol=atol, + rtol=_rtol, + atol=_atol, obj=f"{obj}.index", ) @@ -1195,11 +1241,11 @@ def assert_frame_equal( right.columns, exact=check_column_type, check_names=check_names, - check_exact=check_exact, + check_exact=_check_exact, check_categorical=check_categorical, check_order=not check_like, - rtol=rtol, - atol=atol, + rtol=_rtol, + atol=_atol, obj=f"{obj}.columns", ) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index c4ffc197298f0..784a0347cf92b 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -462,3 +462,15 @@ def test_ea_and_numpy_no_dtype_check(val, check_exact, dtype): left = Series([1, 2, val], dtype=dtype) right = Series(pd.array([1, 2, val])) tm.assert_series_equal(left, right, check_dtype=False, check_exact=check_exact) + + +def test_assert_series_equal_int_tol(): + # GH#56646 + left = Series([81, 18, 121, 38, 74, 72, 81, 81, 146, 81, 81, 170, 74, 74]) + right = Series([72, 9, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72]) + tm.assert_series_equal(left, right, rtol=1.5) + + tm.assert_frame_equal(left.to_frame(), right.to_frame(), rtol=1.5) + tm.assert_extension_array_equal( + left.astype("Int64").values, right.astype("Int64").values, rtol=1.5 + ) From c4e04e0ea10a1d9ff6862cc4c5c6c82aee9e1ba8 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 9 Jan 2024 03:28:18 +0100 Subject: [PATCH 033/396] Backport PR #56402 on branch 2.2.x (TST/CoW: expand test for chained inplace methods) (#56790) Backport PR #56402: TST/CoW: expand test for chained inplace methods Co-authored-by: Joris Van den Bossche --- .../test_chained_assignment_deprecation.py | 69 ++++++++++++++++++- 1 file changed, 66 insertions(+), 3 deletions(-) diff --git a/pandas/tests/copy_view/test_chained_assignment_deprecation.py b/pandas/tests/copy_view/test_chained_assignment_deprecation.py index 80e38380ed27c..0a37f6b813e55 100644 --- a/pandas/tests/copy_view/test_chained_assignment_deprecation.py +++ b/pandas/tests/copy_view/test_chained_assignment_deprecation.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import PY311 from pandas.errors import ( ChainedAssignmentError, SettingWithCopyWarning, @@ -42,7 +43,9 @@ def test_methods_iloc_warn(using_copy_on_write): ("ffill", ()), ], ) -def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write): +def test_methods_iloc_getitem_item_cache( + func, args, using_copy_on_write, warn_copy_on_write +): # ensure we don't incorrectly raise chained assignment warning because # of the item cache / iloc not setting the item cache df_orig = DataFrame({"a": [1, 2, 3], "b": 1}) @@ -66,14 +69,74 @@ def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write): ser = df["a"] getattr(ser, func)(*args, 
inplace=True) + df = df_orig.copy() + df["a"] # populate the item_cache + # TODO(CoW-warn) because of the usage of *args, this doesn't warn on Py3.11+ + if using_copy_on_write: + with tm.raises_chained_assignment_error(not PY311): + getattr(df["a"], func)(*args, inplace=True) + else: + with tm.assert_cow_warning(not PY311, match="A value"): + getattr(df["a"], func)(*args, inplace=True) + + df = df_orig.copy() + ser = df["a"] # populate the item_cache and keep ref + if using_copy_on_write: + with tm.raises_chained_assignment_error(not PY311): + getattr(df["a"], func)(*args, inplace=True) + else: + # ideally also warns on the default mode, but the ser' _cacher + # messes up the refcount + even in warning mode this doesn't trigger + # the warning of Py3.1+ (see above) + with tm.assert_cow_warning(warn_copy_on_write and not PY311, match="A value"): + getattr(df["a"], func)(*args, inplace=True) + + +def test_methods_iloc_getitem_item_cache_fillna( + using_copy_on_write, warn_copy_on_write +): + # ensure we don't incorrectly raise chained assignment warning because + # of the item cache / iloc not setting the item cache + df_orig = DataFrame({"a": [1, 2, 3], "b": 1}) + + df = df_orig.copy() + ser = df.iloc[:, 0] + ser.fillna(1, inplace=True) + + # parent that holds item_cache is dead, so don't increase ref count + df = df_orig.copy() + ser = df.copy()["a"] + ser.fillna(1, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + ser = df.iloc[:, 0] # iloc creates a new object + ser.fillna(1, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + ser = df["a"] + ser.fillna(1, inplace=True) + df = df_orig.copy() df["a"] # populate the item_cache if using_copy_on_write: with tm.raises_chained_assignment_error(): - df["a"].fillna(0, inplace=True) + df["a"].fillna(1, inplace=True) else: with tm.assert_cow_warning(match="A value"): - df["a"].fillna(0, inplace=True) + df["a"].fillna(1, inplace=True) + + df = df_orig.copy() + ser = df["a"] # populate the item_cache and keep ref + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["a"].fillna(1, inplace=True) + else: + # TODO(CoW-warn) ideally also warns on the default mode, but the ser' _cacher + # messes up the refcount + with tm.assert_cow_warning(warn_copy_on_write, match="A value"): + df["a"].fillna(1, inplace=True) # TODO(CoW-warn) expand the cases From 3c89432d1adf277d785145f5c62abca1157e19dc Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 9 Jan 2024 10:57:14 +0100 Subject: [PATCH 034/396] Backport PR #56772 on branch 2.2.x (Support large strings in interchange protocol) (#56795) Backport PR #56772: Support large strings in interchange protocol Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/dtypes/dtypes.py | 4 +++- pandas/core/interchange/column.py | 9 +++------ pandas/core/interchange/utils.py | 1 + pandas/tests/interchange/test_impl.py | 9 +++++++++ 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index dbb11d3d0788d..36e677fa2a7a9 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -906,6 +906,7 @@ Sparse Other ^^^^^ +- :meth:`DataFrame.__dataframe__` did not support pyarrow large strings (:issue:`56702`) - Bug in :func:`DataFrame.describe` when formatting percentiles in the resulting percentile 99.999% is rounded to 100% 
(:issue:`55765`) - Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`) - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index ed5256922377a..e90e92fa0ee1c 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2190,7 +2190,9 @@ def numpy_dtype(self) -> np.dtype: # This can be removed if/when pyarrow addresses it: # https://github.com/apache/arrow/issues/34462 return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") - if pa.types.is_string(self.pyarrow_dtype): + if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( + self.pyarrow_dtype + ): # pa.string().to_pandas_dtype() = object which we don't want return np.dtype(str) try: diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index acfbc5d9e6c62..7f524d6823f30 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -301,12 +301,9 @@ def _get_data_buffer( buffer = PandasBuffer(np.frombuffer(b, dtype="uint8")) # Define the dtype for the returned buffer - dtype = ( - DtypeKind.STRING, - 8, - ArrowCTypes.STRING, - Endianness.NATIVE, - ) # note: currently only support native endianness + # TODO: this will need correcting + # https://github.com/pandas-dev/pandas/issues/54781 + dtype = self.dtype else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py index 4ac063080e62d..2e73e560e5740 100644 --- a/pandas/core/interchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -37,6 +37,7 @@ "float": "f", # float32 "double": "g", # float64 "string": "u", + "large_string": "U", "binary": "z", "time32[s]": "tts", "time32[ms]": "ttm", diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 15c2b8d000b37..27ea8ccdd17b1 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -362,3 +362,12 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: interchange.get_column_by_name = lambda _: column monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange) pd.api.interchange.from_dataframe(df) + + +def test_large_string(): + # GH#56702 + pytest.importorskip("pyarrow") + df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") + result = pd.api.interchange.from_dataframe(df.__dataframe__()) + expected = pd.DataFrame({"a": ["x"]}, dtype="object") + tm.assert_frame_equal(result, expected) From 3945d5e16b0aca22166ce1e6a03ee978fb8853fd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 9 Jan 2024 16:47:34 +0100 Subject: [PATCH 035/396] Backport PR #56442 on branch 2.2.x (BUG: merge not sorting for new string dtype) (#56799) BUG: merge not sorting for new string dtype (#56442) * BUG: merge not sorting for new string dtype * Fixup * Update test_multi.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> (cherry picked from commit b7e2202459eadc9dd599cbe58251ecc930798b97) --- doc/source/whatsnew/v2.2.0.rst | 1 + 
pandas/core/reshape/merge.py | 18 +++- pandas/tests/reshape/merge/test_join.py | 43 ++++++---- pandas/tests/reshape/merge/test_multi.py | 100 ++++++++++++----------- 4 files changed, 94 insertions(+), 68 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 36e677fa2a7a9..6a232365fbfeb 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -893,6 +893,7 @@ Reshaping - Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on a :class:`ArrowDtype` column (:issue:`56486`) - Bug in :func:`merge` not raising when merging datetime columns with timedelta columns (:issue:`56455`) - Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`) +- Bug in :func:`merge` not sorting for new string dtype (:issue:`56442`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`) - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 320e4e33a29fb..410301b7697f2 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2488,18 +2488,30 @@ def _factorize_keys( .combine_chunks() .dictionary_encode() ) - length = len(dc.dictionary) llab, rlab, count = ( - pc.fill_null(dc.indices[slice(len_lk)], length) + pc.fill_null(dc.indices[slice(len_lk)], -1) .to_numpy() .astype(np.intp, copy=False), - pc.fill_null(dc.indices[slice(len_lk, None)], length) + pc.fill_null(dc.indices[slice(len_lk, None)], -1) .to_numpy() .astype(np.intp, copy=False), len(dc.dictionary), ) + + if sort: + uniques = dc.dictionary.to_numpy(zero_copy_only=False) + llab, rlab = _sort_labels(uniques, llab, rlab) + if dc.null_count > 0: + lmask = llab == -1 + lany = lmask.any() + rmask = rlab == -1 + rany = rmask.any() + if lany: + np.putmask(llab, lmask, count) + if rany: + np.putmask(rlab, rmask, count) count += 1 return llab, rlab, count diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 5a1f47e341222..1d5ed2d7373ce 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -16,6 +16,7 @@ bdate_range, concat, merge, + option_context, ) import pandas._testing as tm @@ -563,24 +564,30 @@ def test_join_many_non_unique_index(self): tm.assert_frame_equal(inner, left) tm.assert_frame_equal(inner, right) - def test_join_sort(self): - left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}) - right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) - - joined = left.join(right, on="key", sort=True) - expected = DataFrame( - { - "key": ["bar", "baz", "foo", "foo"], - "value": [2, 3, 1, 4], - "value2": ["a", "b", "c", "c"], - }, - index=[1, 2, 0, 3], - ) - tm.assert_frame_equal(joined, expected) - - # smoke test - joined = left.join(right, on="key", sort=False) - tm.assert_index_equal(joined.index, Index(range(4)), exact=True) + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_join_sort(self, infer_string): + with option_context("future.infer_string", infer_string): + left = DataFrame( + {"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]} + ) + right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) + + joined = 
left.join(right, on="key", sort=True) + expected = DataFrame( + { + "key": ["bar", "baz", "foo", "foo"], + "value": [2, 3, 1, 4], + "value2": ["a", "b", "c", "c"], + }, + index=[1, 2, 0, 3], + ) + tm.assert_frame_equal(joined, expected) + + # smoke test + joined = left.join(right, on="key", sort=False) + tm.assert_index_equal(joined.index, Index(range(4)), exact=True) def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 269d3a2b7078e..5973f13c9d495 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -9,6 +11,7 @@ RangeIndex, Series, Timestamp, + option_context, ) import pandas._testing as tm from pandas.core.reshape.concat import concat @@ -88,67 +91,70 @@ def test_merge_on_multikey(self, left, right, join_type): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("sort", [False, True]) - def test_left_join_multi_index(self, sort): - icols = ["1st", "2nd", "3rd"] + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_left_join_multi_index(self, sort, infer_string): + with option_context("future.infer_string", infer_string): + icols = ["1st", "2nd", "3rd"] - def bind_cols(df): - iord = lambda a: 0 if a != a else ord(a) - f = lambda ts: ts.map(iord) - ord("a") - return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10 + def bind_cols(df): + iord = lambda a: 0 if a != a else ord(a) + f = lambda ts: ts.map(iord) - ord("a") + return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10 - def run_asserts(left, right, sort): - res = left.join(right, on=icols, how="left", sort=sort) + def run_asserts(left, right, sort): + res = left.join(right, on=icols, how="left", sort=sort) - assert len(left) < len(res) + 1 - assert not res["4th"].isna().any() - assert not res["5th"].isna().any() + assert len(left) < len(res) + 1 + assert not res["4th"].isna().any() + assert not res["5th"].isna().any() - tm.assert_series_equal(res["4th"], -res["5th"], check_names=False) - result = bind_cols(res.iloc[:, :-2]) - tm.assert_series_equal(res["4th"], result, check_names=False) - assert result.name is None + tm.assert_series_equal(res["4th"], -res["5th"], check_names=False) + result = bind_cols(res.iloc[:, :-2]) + tm.assert_series_equal(res["4th"], result, check_names=False) + assert result.name is None - if sort: - tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort")) + if sort: + tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort")) - out = merge(left, right.reset_index(), on=icols, sort=sort, how="left") + out = merge(left, right.reset_index(), on=icols, sort=sort, how="left") - res.index = RangeIndex(len(res)) - tm.assert_frame_equal(out, res) + res.index = RangeIndex(len(res)) + tm.assert_frame_equal(out, res) - lc = list(map(chr, np.arange(ord("a"), ord("z") + 1))) - left = DataFrame( - np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"] - ) - # Explicit cast to float to avoid implicit cast when setting nan - left.insert( - 1, - "2nd", - np.random.default_rng(2).integers(0, 10, len(left)).astype("float"), - ) + lc = list(map(chr, np.arange(ord("a"), ord("z") + 1))) + left = DataFrame( + np.random.default_rng(2).choice(lc, 
(50, 2)), columns=["1st", "3rd"] + ) + # Explicit cast to float to avoid implicit cast when setting nan + left.insert( + 1, + "2nd", + np.random.default_rng(2).integers(0, 10, len(left)).astype("float"), + ) - i = np.random.default_rng(2).permutation(len(left)) - right = left.iloc[i].copy() + i = np.random.default_rng(2).permutation(len(left)) + right = left.iloc[i].copy() - left["4th"] = bind_cols(left) - right["5th"] = -bind_cols(right) - right.set_index(icols, inplace=True) + left["4th"] = bind_cols(left) + right["5th"] = -bind_cols(right) + right.set_index(icols, inplace=True) - run_asserts(left, right, sort) + run_asserts(left, right, sort) - # inject some nulls - left.loc[1::4, "1st"] = np.nan - left.loc[2::5, "2nd"] = np.nan - left.loc[3::6, "3rd"] = np.nan - left["4th"] = bind_cols(left) + # inject some nulls + left.loc[1::4, "1st"] = np.nan + left.loc[2::5, "2nd"] = np.nan + left.loc[3::6, "3rd"] = np.nan + left["4th"] = bind_cols(left) - i = np.random.default_rng(2).permutation(len(left)) - right = left.iloc[i, :-1] - right["5th"] = -bind_cols(right) - right.set_index(icols, inplace=True) + i = np.random.default_rng(2).permutation(len(left)) + right = left.iloc[i, :-1] + right["5th"] = -bind_cols(right) + right.set_index(icols, inplace=True) - run_asserts(left, right, sort) + run_asserts(left, right, sort) @pytest.mark.parametrize("sort", [False, True]) def test_merge_right_vs_left(self, left, right, sort): From 2ddeb4577363e1c3e3a839ec45c55deec4aa67c1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 9 Jan 2024 23:29:45 +0100 Subject: [PATCH 036/396] CI: Add fixture back in (#56803) --- pandas/tests/reshape/merge/test_multi.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 5973f13c9d495..b1aa6b88bc4ee 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -94,6 +94,7 @@ def test_merge_on_multikey(self, left, right, join_type): @pytest.mark.parametrize( "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] ) + @pytest.mark.parametrize("sort", [True, False]) def test_left_join_multi_index(self, sort, infer_string): with option_context("future.infer_string", infer_string): icols = ["1st", "2nd", "3rd"] From 66df0bda738c43506a22448b1ba2c7c050a07baa Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 10 Jan 2024 00:46:09 +0100 Subject: [PATCH 037/396] Backport PR #56059 on branch 2.2.x (ENH: Add case_when method) (#56800) ENH: Add case_when method (#56059) (cherry picked from commit e3a55a4cbfc83ec4ab1bcf73a1a0ec96e670903a) Co-authored-by: Samuel Oranyeli --- doc/source/reference/series.rst | 1 + doc/source/whatsnew/v2.2.0.rst | 20 +++ pandas/core/series.py | 124 ++++++++++++++- pandas/tests/series/methods/test_case_when.py | 148 ++++++++++++++++++ 4 files changed, 292 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/series/methods/test_case_when.py diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index af262f9e6c336..a4ea0ec396ceb 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -177,6 +177,7 @@ Reindexing / selection / label manipulation :toctree: api/ Series.align + Series.case_when Series.drop Series.droplevel Series.drop_duplicates diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 6a232365fbfeb..e244794664b34 
100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -188,6 +188,26 @@ For a full list of ADBC drivers and their development status, see the `ADBC Driv Implementation Status `_ documentation. +.. _whatsnew_220.enhancements.case_when: + +Create a pandas Series based on one or more conditions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`Series.case_when` function has been added to create a Series object based on one or more conditions. (:issue:`39154`) + +.. ipython:: python + + import pandas as pd + + df = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6])) + default=pd.Series('default', index=df.index) + default.case_when( + caselist=[ + (df.a == 1, 'first'), # condition, replacement + (df.a.gt(1) & df.b.eq(5), 'second'), # condition, replacement + ], + ) + .. _whatsnew_220.enhancements.to_numpy_ea: ``to_numpy`` for NumPy nullable and Arrow types converts to suitable NumPy dtype diff --git a/pandas/core/series.py b/pandas/core/series.py index a6762dd1b48a2..83eb545b9b681 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -67,6 +67,9 @@ from pandas.core.dtypes.astype import astype_is_view from pandas.core.dtypes.cast import ( LossySetitemError, + construct_1d_arraylike_from_scalar, + find_common_type, + infer_dtype_from, maybe_box_native, maybe_cast_pointwise_result, ) @@ -84,7 +87,10 @@ CategoricalDtype, ExtensionDtype, ) -from pandas.core.dtypes.generic import ABCDataFrame +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import ( isna, @@ -113,6 +119,7 @@ from pandas.core.arrays.sparse import SparseAccessor from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import ( + array as pd_array, extract_array, sanitize_array, ) @@ -5627,6 +5634,121 @@ def between( return lmask & rmask + def case_when( + self, + caselist: list[ + tuple[ + ArrayLike | Callable[[Series], Series | np.ndarray | Sequence[bool]], + ArrayLike | Scalar | Callable[[Series], Series | np.ndarray], + ], + ], + ) -> Series: + """ + Replace values where the conditions are True. + + Parameters + ---------- + caselist : A list of tuples of conditions and expected replacements + Takes the form: ``(condition0, replacement0)``, + ``(condition1, replacement1)``, ... . + ``condition`` should be a 1-D boolean array-like object + or a callable. If ``condition`` is a callable, + it is computed on the Series + and should return a boolean Series or array. + The callable must not change the input Series + (though pandas doesn`t check it). ``replacement`` should be a + 1-D array-like object, a scalar or a callable. + If ``replacement`` is a callable, it is computed on the Series + and should return a scalar or Series. The callable + must not change the input Series + (though pandas doesn`t check it). + + .. versionadded:: 2.2.0 + + Returns + ------- + Series + + See Also + -------- + Series.mask : Replace values where the condition is True. + + Examples + -------- + >>> c = pd.Series([6, 7, 8, 9], name='c') + >>> a = pd.Series([0, 0, 1, 2]) + >>> b = pd.Series([0, 3, 4, 5]) + + >>> c.case_when(caselist=[(a.gt(0), a), # condition, replacement + ... 
(b.gt(0), b)]) + 0 6 + 1 3 + 2 1 + 3 2 + Name: c, dtype: int64 + """ + if not isinstance(caselist, list): + raise TypeError( + f"The caselist argument should be a list; instead got {type(caselist)}" + ) + + if not caselist: + raise ValueError( + "provide at least one boolean condition, " + "with a corresponding replacement." + ) + + for num, entry in enumerate(caselist): + if not isinstance(entry, tuple): + raise TypeError( + f"Argument {num} must be a tuple; instead got {type(entry)}." + ) + if len(entry) != 2: + raise ValueError( + f"Argument {num} must have length 2; " + "a condition and replacement; " + f"instead got length {len(entry)}." + ) + caselist = [ + ( + com.apply_if_callable(condition, self), + com.apply_if_callable(replacement, self), + ) + for condition, replacement in caselist + ] + default = self.copy() + conditions, replacements = zip(*caselist) + common_dtypes = [infer_dtype_from(arg)[0] for arg in [*replacements, default]] + if len(set(common_dtypes)) > 1: + common_dtype = find_common_type(common_dtypes) + updated_replacements = [] + for condition, replacement in zip(conditions, replacements): + if is_scalar(replacement): + replacement = construct_1d_arraylike_from_scalar( + value=replacement, length=len(condition), dtype=common_dtype + ) + elif isinstance(replacement, ABCSeries): + replacement = replacement.astype(common_dtype) + else: + replacement = pd_array(replacement, dtype=common_dtype) + updated_replacements.append(replacement) + replacements = updated_replacements + default = default.astype(common_dtype) + + counter = reversed(range(len(conditions))) + for position, condition, replacement in zip( + counter, conditions[::-1], replacements[::-1] + ): + try: + default = default.mask( + condition, other=replacement, axis=0, inplace=False, level=None + ) + except Exception as error: + raise ValueError( + f"Failed to apply condition{position} and replacement{position}." + ) from error + return default + # error: Cannot determine type of 'isna' @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type] def isna(self) -> Series: diff --git a/pandas/tests/series/methods/test_case_when.py b/pandas/tests/series/methods/test_case_when.py new file mode 100644 index 0000000000000..7cb60a11644a3 --- /dev/null +++ b/pandas/tests/series/methods/test_case_when.py @@ -0,0 +1,148 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, + array as pd_array, + date_range, +) +import pandas._testing as tm + + +@pytest.fixture +def df(): + """ + base dataframe for testing + """ + return DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + +def test_case_when_caselist_is_not_a_list(df): + """ + Raise ValueError if caselist is not a list. + """ + msg = "The caselist argument should be a list; " + msg += "instead got.+" + with pytest.raises(TypeError, match=msg): # GH39154 + df["a"].case_when(caselist=()) + + +def test_case_when_no_caselist(df): + """ + Raise ValueError if no caselist is provided. + """ + msg = "provide at least one boolean condition, " + msg += "with a corresponding replacement." + with pytest.raises(ValueError, match=msg): # GH39154 + df["a"].case_when([]) + + +def test_case_when_odd_caselist(df): + """ + Raise ValueError if no of caselist is odd. + """ + msg = "Argument 0 must have length 2; " + msg += "a condition and replacement; instead got length 3." 
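# A short illustrative sketch of Series.case_when beyond the doctests above
# (the names here are arbitrary examples, not taken from the patch). Earlier
# entries in ``caselist`` take precedence because the replacements are applied
# in reverse order via Series.mask.
import pandas as pd

a = pd.Series([0, 1, 2])
default = pd.Series("mid", index=a.index)
result = default.case_when(
    caselist=[
        (a.eq(0), "low"),   # condition, replacement
        (a.eq(2), "high"),
    ]
)
print(result.tolist())  # ['low', 'mid', 'high']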
+ + with pytest.raises(ValueError, match=msg): + df["a"].case_when([(df["a"].eq(1), 1, df.a.gt(1))]) + + +def test_case_when_raise_error_from_mask(df): + """ + Raise Error from within Series.mask + """ + msg = "Failed to apply condition0 and replacement0." + with pytest.raises(ValueError, match=msg): + df["a"].case_when([(df["a"].eq(1), [1, 2])]) + + +def test_case_when_single_condition(df): + """ + Test output on a single condition. + """ + result = Series([np.nan, np.nan, np.nan]).case_when([(df.a.eq(1), 1)]) + expected = Series([1, np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions(df): + """ + Test output when booleans are derived from a computation + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [(df.a.eq(1), 1), (Series([False, True, False]), 2)] + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_list(df): + """ + Test output when replacement is a list + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [([True, False, False], 1), (df["a"].gt(1) & df["b"].eq(5), [1, 2, 3])] + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_extension_dtype(df): + """ + Test output when replacement has an extension dtype + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [ + ([True, False, False], 1), + (df["a"].gt(1) & df["b"].eq(5), pd_array([1, 2, 3], dtype="Int64")), + ], + ) + expected = Series([1, 2, np.nan], dtype="Float64") + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_series(df): + """ + Test output when replacement is a Series + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [ + (np.array([True, False, False]), 1), + (df["a"].gt(1) & df["b"].eq(5), Series([1, 2, 3])), + ], + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_non_range_index(): + """ + Test output if index is not RangeIndex + """ + rng = np.random.default_rng(seed=123) + dates = date_range("1/1/2000", periods=8) + df = DataFrame( + rng.standard_normal(size=(8, 4)), index=dates, columns=["A", "B", "C", "D"] + ) + result = Series(5, index=df.index, name="A").case_when([(df.A.gt(0), df.B)]) + expected = df.A.mask(df.A.gt(0), df.B).where(df.A.gt(0), 5) + tm.assert_series_equal(result, expected) + + +def test_case_when_callable(): + """ + Test output on a callable + """ + # https://numpy.org/doc/stable/reference/generated/numpy.piecewise.html + x = np.linspace(-2.5, 2.5, 6) + ser = Series(x) + result = ser.case_when( + caselist=[ + (lambda df: df < 0, lambda df: -df), + (lambda df: df >= 0, lambda df: df), + ] + ) + expected = np.piecewise(x, [x < 0, x >= 0], [lambda x: -x, lambda x: x]) + tm.assert_series_equal(result, Series(expected)) From 596ea0ba0511aa3c6c94baa0222a26c2d6291522 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 10 Jan 2024 12:47:41 +0000 Subject: [PATCH 038/396] 'Backport PR #56146: BUG raise pdep6 warning for loc full setter' (#56807) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/indexing.py | 20 +++++++++++++++ pandas/core/internals/blocks.py | 3 +++ pandas/tests/copy_view/test_indexing.py | 7 +++++- pandas/tests/frame/indexing/test_indexing.py | 21 ++++++++-------- pandas/tests/frame/indexing/test_setitem.py | 20 +++++++++++++++ pandas/tests/frame/methods/test_update.py | 5 +--- 
pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/indexing/test_iloc.py | 6 +++-- pandas/tests/indexing/test_loc.py | 25 +++++++++++++------ pandas/tests/io/json/test_pandas.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 4 +-- pandas/tests/series/indexing/test_indexing.py | 4 +-- 13 files changed, 89 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index e244794664b34..9d577aa5ac426 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -817,6 +817,7 @@ Conversion - Bug in :meth:`DataFrame.astype` when called with ``str`` on unpickled array - the array might change in-place (:issue:`54654`) - Bug in :meth:`DataFrame.astype` where ``errors="ignore"`` had no effect for extension types (:issue:`54654`) - Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`) +- Bug in ``DataFrame.loc`` was not throwing "incompatible dtype warning" (see `PDEP6 `_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. ``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`) Strings ^^^^^^^ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 4be7e17035128..934ba3a4d7f29 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2141,6 +2141,26 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: # If we're setting an entire column and we can't do it inplace, # then we can use value's dtype (or inferred dtype) # instead of object + dtype = self.obj.dtypes.iloc[loc] + if dtype not in (np.void, object) and not self.obj.empty: + # - Exclude np.void, as that is a special case for expansion. + # We want to warn for + # df = pd.DataFrame({'a': [1, 2]}) + # df.loc[:, 'a'] = .3 + # but not for + # df = pd.DataFrame({'a': [1, 2]}) + # df.loc[:, 'b'] = .3 + # - Exclude `object`, as then no upcasting happens. + # - Exclude empty initial object with enlargement, + # as then there's nothing to be inconsistent with. + warnings.warn( + f"Setting an item of incompatible dtype is deprecated " + "and will raise in a future error of pandas. 
" + f"Value '{value}' has dtype incompatible with {dtype}, " + "please explicitly cast to a compatible dtype first.", + FutureWarning, + stacklevel=find_stack_level(), + ) self.obj.isetitem(loc, value) else: # set value into the column (first attempting to operate inplace, then diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 06fd9ebe47eae..70a27300bd60f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -499,6 +499,9 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: and is_integer_dtype(self.values.dtype) and isna(other) and other is not NaT + and not ( + isinstance(other, (np.datetime64, np.timedelta64)) and np.isnat(other) + ) ): warn_on_upcast = False elif ( diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 2681c07f01990..479fa148f994a 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -1144,11 +1144,16 @@ def test_set_value_copy_only_necessary_column( df_orig = df.copy() view = df[:] - if val == "a" and indexer[0] != slice(None): + if val == "a" and not warn_copy_on_write: with tm.assert_produces_warning( FutureWarning, match="Setting an item of incompatible dtype is deprecated" ): indexer_func(df)[indexer] = val + if val == "a" and warn_copy_on_write: + with tm.assert_produces_warning( + FutureWarning, match="incompatible dtype|Setting a value on a view" + ): + indexer_func(df)[indexer] = val else: with tm.assert_cow_warning(warn_copy_on_write and val == 100): indexer_func(df)[indexer] = val diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 97e7ae15c6c63..22d9c7f26a57c 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -949,7 +949,8 @@ def test_setitem_frame_upcast(self): # needs upcasting df = DataFrame([[1, 2, "foo"], [3, 4, "bar"]], columns=["A", "B", "C"]) df2 = df.copy() - df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5 + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5 expected = df.reindex(columns=["A", "B"]) expected += 0.5 expected["C"] = df["C"] @@ -1387,20 +1388,20 @@ def test_loc_expand_empty_frame_keep_midx_names(self): tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( - "val, idxr, warn", + "val, idxr", [ - ("x", "a", None), # TODO: this should warn as well - ("x", ["a"], None), # TODO: this should warn as well - (1, "a", None), # TODO: this should warn as well - (1, ["a"], FutureWarning), + ("x", "a"), + ("x", ["a"]), + (1, "a"), + (1, ["a"]), ], ) - def test_loc_setitem_rhs_frame(self, idxr, val, warn): + def test_loc_setitem_rhs_frame(self, idxr, val): # GH#47578 df = DataFrame({"a": [1, 2]}) with tm.assert_produces_warning( - warn, match="Setting an item of incompatible dtype" + FutureWarning, match="Setting an item of incompatible dtype" ): df.loc[:, idxr] = DataFrame({"a": [val, 11]}, index=[1, 2]) expected = DataFrame({"a": [np.nan, val]}) @@ -1996,7 +1997,7 @@ def _check_setitem_invalid(self, df, invalid, indexer, warn): np.datetime64("NaT"), np.timedelta64("NaT"), ] - _indexers = [0, [0], slice(0, 1), [True, False, False]] + _indexers = [0, [0], slice(0, 1), [True, False, False], slice(None, None, None)] @pytest.mark.parametrize( "invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)] @@ -2010,7 +2011,7 @@ def 
test_setitem_validation_scalar_bool(self, invalid, indexer): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, 3]}, dtype=any_int_numpy_dtype) - if isna(invalid) and invalid is not pd.NaT: + if isna(invalid) and invalid is not pd.NaT and not np.isnat(invalid): warn = None else: warn = FutureWarning diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index e802a56ecbc81..99233d3cd4cf3 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1381,3 +1381,23 @@ def test_frame_setitem_empty_dataframe(self): index=dti[:0], ) tm.assert_frame_equal(df, expected) + + +def test_full_setter_loc_incompatible_dtype(): + # https://github.com/pandas-dev/pandas/issues/55791 + df = DataFrame({"a": [1, 2]}) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "a"] = True + expected = DataFrame({"a": [True, True]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"a": [1, 2]}) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "a"] = {0: 3.5, 1: 4.5} + expected = DataFrame({"a": [3.5, 4.5]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"a": [1, 2]}) + df.loc[:, "a"] = {0: 3, 1: 4} + expected = DataFrame({"a": [3, 4]}) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 7c7a0d23ff75f..20ba550beeb30 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -160,11 +160,8 @@ def test_update_with_different_dtype(self, using_copy_on_write): # GH#3217 df = DataFrame({"a": [1, 3], "b": [np.nan, 2]}) df["c"] = np.nan - if using_copy_on_write: + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): df.update({"c": Series(["foo"], index=[0])}) - else: - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - df["c"].update(Series(["foo"], index=[0])) expected = DataFrame( { diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 6e818d79d5ba8..acd0675fd43ec 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2857,7 +2857,7 @@ def test_dict_data_arrow_column_expansion(self, key_val, col_vals, col_type): ) result = DataFrame({key_val: [1, 2]}, columns=cols) expected = DataFrame([[1, np.nan], [2, np.nan]], columns=cols) - expected.iloc[:, 1] = expected.iloc[:, 1].astype(object) + expected.isetitem(1, expected.iloc[:, 1].astype(object)) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 409eca42f404b..43dd3812e8b7d 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -535,7 +535,8 @@ def test_iloc_setitem_frame_duplicate_columns_multiple_blocks( # if the assigned values cannot be held by existing integer arrays, # we cast - df.iloc[:, 0] = df.iloc[:, 0] + 0.5 + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.iloc[:, 0] = df.iloc[:, 0] + 0.5 if not using_array_manager: assert len(df._mgr.blocks) == 2 @@ -1471,6 +1472,7 @@ def test_iloc_setitem_pure_position_based(self): def test_iloc_nullable_int64_size_1_nan(self): # GH 31861 result = DataFrame({"a": ["test"], "b": 
[np.nan]}) - result.loc[:, "b"] = result.loc[:, "b"].astype("Int64") + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + result.loc[:, "b"] = result.loc[:, "b"].astype("Int64") expected = DataFrame({"a": ["test"], "b": array([NA], dtype="Int64")}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index fb0adc56c401b..61c44c8a2a8f4 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -584,7 +584,8 @@ def test_loc_setitem_consistency(self, frame_for_consistency, val): } ) df = frame_for_consistency.copy() - df.loc[:, "date"] = val + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "date"] = val tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_dt64_to_str(self, frame_for_consistency): @@ -598,7 +599,8 @@ def test_loc_setitem_consistency_dt64_to_str(self, frame_for_consistency): } ) df = frame_for_consistency.copy() - df.loc[:, "date"] = "foo" + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "date"] = "foo" tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_dt64_to_float(self, frame_for_consistency): @@ -611,14 +613,16 @@ def test_loc_setitem_consistency_dt64_to_float(self, frame_for_consistency): } ) df = frame_for_consistency.copy() - df.loc[:, "date"] = 1.0 + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "date"] = 1.0 tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_single_row(self): # GH 15494 # setting on frame with single row df = DataFrame({"date": Series([Timestamp("20180101")])}) - df.loc[:, "date"] = "string" + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "date"] = "string" expected = DataFrame({"date": Series(["string"])}) tm.assert_frame_equal(df, expected) @@ -678,9 +682,10 @@ def test_loc_setitem_consistency_slice_column_len(self): # timedelta64[m] -> float, so this cannot be done inplace, so # no warning - df.loc[:, ("Respondent", "Duration")] = df.loc[ - :, ("Respondent", "Duration") - ] / Timedelta(60_000_000_000) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, ("Respondent", "Duration")] = df.loc[ + :, ("Respondent", "Duration") + ] / Timedelta(60_000_000_000) expected = Series( [23.0, 12.0, 14.0, 36.0], index=df.index, name=("Respondent", "Duration") @@ -1487,7 +1492,11 @@ def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): # if result started off with object dtype, then the .loc.__setitem__ # below would retain object dtype result = DataFrame(index=idx, columns=["var"], dtype=np.float64) - result.loc[:, idxer] = expected + with tm.assert_produces_warning( + FutureWarning if idxer == "var" else None, match="incompatible dtype" + ): + # See https://github.com/pandas-dev/pandas/issues/56223 + result.loc[:, idxer] = expected tm.assert_frame_equal(result, expected) def test_loc_setitem_time_key(self, using_array_manager): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 0eefb0b52c483..1da27ad173235 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -179,7 +179,7 @@ def test_frame_non_unique_columns(self, orient, data): # in milliseconds; these are internally stored in nanosecond, # so divide to get where we need # TODO: a to_epoch method would also solve; see 
GH 14772 - expected.iloc[:, 0] = expected.iloc[:, 0].astype(np.int64) // 1000000 + expected.isetitem(0, expected.iloc[:, 0].astype(np.int64) // 1000000) elif orient == "split": expected = df expected.columns = ["x", "x.1"] diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index ab8d22e567d27..27959609422f3 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2984,9 +2984,9 @@ def test_merge_empty_frames_column_order(left_empty, right_empty): if left_empty and right_empty: expected = expected.iloc[:0] elif left_empty: - expected.loc[:, "B"] = np.nan + expected["B"] = np.nan elif right_empty: - expected.loc[:, ["C", "D"]] = np.nan + expected[["C", "D"]] = np.nan tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index c52e47a812183..f4992b758af74 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -491,7 +491,7 @@ def _check_setitem_invalid(self, ser, invalid, indexer, warn): np.datetime64("NaT"), np.timedelta64("NaT"), ] - _indexers = [0, [0], slice(0, 1), [True, False, False]] + _indexers = [0, [0], slice(0, 1), [True, False, False], slice(None, None, None)] @pytest.mark.parametrize( "invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)] @@ -505,7 +505,7 @@ def test_setitem_validation_scalar_bool(self, invalid, indexer): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): ser = Series([1, 2, 3], dtype=any_int_numpy_dtype) - if isna(invalid) and invalid is not NaT: + if isna(invalid) and invalid is not NaT and not np.isnat(invalid): warn = None else: warn = FutureWarning From 8757a3c9b35ae2be257f35495f1e017b4ca765fe Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 10 Jan 2024 17:27:15 +0100 Subject: [PATCH 039/396] Backport PR #56757 on branch 2.2.x (ENH: Implement interpolation for arrow and masked dtypes) (#56809) ENH: Implement interpolation for arrow and masked dtypes (#56757) * ENH: Implement interpolation for arrow and masked dtypes * Fixup * Fix typing * Update (cherry picked from commit 5fc2ed2703a1370207f4ebad834e665b6c2ad42f) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 40 ++++++++++++++ pandas/core/arrays/masked.py | 54 +++++++++++++++++++ pandas/core/missing.py | 14 +++-- .../tests/frame/methods/test_interpolate.py | 41 ++++++++++++-- 5 files changed, 143 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 9d577aa5ac426..9a9ac769a4893 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -343,6 +343,7 @@ Other enhancements - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) - :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area``; 3rd party :class:`.ExtensionArray` authors need to add this argument to the method ``_pad_or_backfill`` (:issue:`56492`) - Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`) +- Implement :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` for :class:`ArrowDtype` and masked dtypes 
(:issue:`56267`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) - Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`) - Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3858ce4cf0ea1..a5ce46ed612f3 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -182,6 +182,7 @@ def floordiv_compat( AxisInt, Dtype, FillnaOptions, + InterpolateOptions, Iterator, NpDtype, NumpySorter, @@ -2048,6 +2049,45 @@ def _maybe_convert_setitem_value(self, value): raise TypeError(msg) from err return value + def interpolate( + self, + *, + method: InterpolateOptions, + axis: int, + index, + limit, + limit_direction, + limit_area, + copy: bool, + **kwargs, + ) -> Self: + """ + See NDFrame.interpolate.__doc__. + """ + # NB: we return type(self) even if copy=False + mask = self.isna() + if self.dtype.kind == "f": + data = self._pa_array.to_numpy() + elif self.dtype.kind in "iu": + data = self.to_numpy(dtype="f8", na_value=0.0) + else: + raise NotImplementedError( + f"interpolate is not implemented for dtype={self.dtype}" + ) + + missing.interpolate_2d_inplace( + data, + method=method, + axis=0, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + mask=mask, + **kwargs, + ) + return type(self)(self._box_pa_array(pa.array(data, mask=mask))) + @classmethod def _if_else( cls, diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 545d45e450f3f..234d96e53a67c 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -22,6 +22,7 @@ AxisInt, DtypeObj, FillnaOptions, + InterpolateOptions, NpDtype, PositionalIndexer, Scalar, @@ -98,6 +99,7 @@ NumpySorter, NumpyValueArrayLike, ) + from pandas.core.arrays import FloatingArray from pandas.compat.numpy import function as nv @@ -1491,6 +1493,58 @@ def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): else: return self.dtype.na_value + def interpolate( + self, + *, + method: InterpolateOptions, + axis: int, + index, + limit, + limit_direction, + limit_area, + copy: bool, + **kwargs, + ) -> FloatingArray: + """ + See NDFrame.interpolate.__doc__. 
+ """ + # NB: we return type(self) even if copy=False + if self.dtype.kind == "f": + if copy: + data = self._data.copy() + mask = self._mask.copy() + else: + data = self._data + mask = self._mask + elif self.dtype.kind in "iu": + copy = True + data = self._data.astype("f8") + mask = self._mask.copy() + else: + raise NotImplementedError( + f"interpolate is not implemented for dtype={self.dtype}" + ) + + missing.interpolate_2d_inplace( + data, + method=method, + axis=0, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + mask=mask, + **kwargs, + ) + if not copy: + return self # type: ignore[return-value] + if self.dtype.kind == "f": + return type(self)._simple_new(data, mask) # type: ignore[return-value] + else: + from pandas.core.arrays import FloatingArray + + return FloatingArray._simple_new(data, mask) + def _accumulate( self, name: str, *, skipna: bool = True, **kwargs ) -> BaseMaskedArray: diff --git a/pandas/core/missing.py b/pandas/core/missing.py index ff45662d0bdc8..c016aab8ad074 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -349,6 +349,7 @@ def interpolate_2d_inplace( limit_direction: str = "forward", limit_area: str | None = None, fill_value: Any | None = None, + mask=None, **kwargs, ) -> None: """ @@ -396,6 +397,7 @@ def func(yvalues: np.ndarray) -> None: limit_area=limit_area_validated, fill_value=fill_value, bounds_error=False, + mask=mask, **kwargs, ) @@ -440,6 +442,7 @@ def _interpolate_1d( fill_value: Any | None = None, bounds_error: bool = False, order: int | None = None, + mask=None, **kwargs, ) -> None: """ @@ -453,8 +456,10 @@ def _interpolate_1d( ----- Fills 'yvalues' in-place. """ - - invalid = isna(yvalues) + if mask is not None: + invalid = mask + else: + invalid = isna(yvalues) valid = ~invalid if not valid.any(): @@ -531,7 +536,10 @@ def _interpolate_1d( **kwargs, ) - if is_datetimelike: + if mask is not None: + mask[:] = False + mask[preserve_nans] = True + elif is_datetimelike: yvalues[preserve_nans] = NaT.value else: yvalues[preserve_nans] = np.nan diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index e0641fcb65bd3..252b950004bea 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -508,8 +508,41 @@ def test_interpolate_empty_df(self): assert result is None tm.assert_frame_equal(df, expected) - def test_interpolate_ea_raise(self): + def test_interpolate_ea(self, any_int_ea_dtype): # GH#55347 - df = DataFrame({"a": [1, None, 2]}, dtype="Int64") - with pytest.raises(NotImplementedError, match="does not implement"): - df.interpolate() + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=any_int_ea_dtype) + orig = df.copy() + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="Float64") + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, orig) + + @pytest.mark.parametrize( + "dtype", + [ + "Float64", + "Float32", + pytest.param("float32[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], + ) + def test_interpolate_ea_float(self, dtype): + # GH#55347 + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype) + orig = df.copy() + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype=dtype) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, orig) + + @pytest.mark.parametrize( + "dtype", + ["int64", 
"uint64", "int32", "int16", "int8", "uint32", "uint16", "uint8"], + ) + def test_interpolate_arrow(self, dtype): + # GH#55347 + pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype + "[pyarrow]") + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="float64[pyarrow]") + tm.assert_frame_equal(result, expected) From 24ea67fcf0cf982d011d249f2a711ef178e13065 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 10 Jan 2024 09:53:27 -0800 Subject: [PATCH 040/396] BLD: Pin numpy on 2.2.x (#56812) --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8724a25909543..2f70ade7b3afe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,9 +30,9 @@ authors = [ license = {file = 'LICENSE'} requires-python = '>=3.9' dependencies = [ - "numpy>=1.22.4; python_version<'3.11'", - "numpy>=1.23.2; python_version=='3.11'", - "numpy>=1.26.0; python_version>='3.12'", + "numpy>=1.22.4,<2; python_version<'3.11'", + "numpy>=1.23.2,<2; python_version=='3.11'", + "numpy>=1.26.0,<2; python_version>='3.12'", "python-dateutil>=2.8.2", "pytz>=2020.1", "tzdata>=2022.7" From 922a671b41dbdeb20856a07eb91b1949bc827a3c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 10 Jan 2024 19:07:53 +0100 Subject: [PATCH 041/396] Backport PR #56594 on branch 2.2.x (DEPR: the method is_anchored() for offsets) (#56813) Backport PR #56594: DEPR: the method is_anchored() for offsets Co-authored-by: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 2 + pandas/_libs/tslibs/offsets.pyx | 54 ++++++++++++++++++- .../indexes/interval/test_interval_range.py | 4 +- .../tseries/offsets/test_business_quarter.py | 19 ++++--- pandas/tests/tseries/offsets/test_fiscal.py | 22 ++++---- pandas/tests/tseries/offsets/test_offsets.py | 7 ++- pandas/tests/tseries/offsets/test_quarter.py | 19 ++++--- pandas/tests/tseries/offsets/test_ticks.py | 5 +- pandas/tests/tseries/offsets/test_week.py | 12 +++-- 9 files changed, 111 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 9a9ac769a4893..b5df0a319bc18 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -665,11 +665,13 @@ Other Deprecations - Deprecated :func:`pd.core.internals.api.make_block`, use public APIs instead (:issue:`40226`) - Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. 
Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`) - Deprecated :meth:`.DataFrameGroupBy.fillna` and :meth:`.SeriesGroupBy.fillna`; use :meth:`.DataFrameGroupBy.ffill`, :meth:`.DataFrameGroupBy.bfill` for forward and backward filling or :meth:`.DataFrame.fillna` to fill with a single value (or the Series equivalents) (:issue:`55718`) +- Deprecated :meth:`DateOffset.is_anchored`, use ``obj.n == 1`` for non-Tick subclasses (for Tick this was always False) (:issue:`55388`) - Deprecated :meth:`DatetimeArray.__init__` and :meth:`TimedeltaArray.__init__`, use :func:`array` instead (:issue:`55623`) - Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`) - Deprecated :meth:`Series.ravel`, the underlying array is already 1D, so ravel is not necessary (:issue:`52511`) - Deprecated :meth:`Series.resample` and :meth:`DataFrame.resample` with a :class:`PeriodIndex` (and the 'convention' keyword), convert to :class:`DatetimeIndex` (with ``.to_timestamp()``) before resampling instead (:issue:`53481`) - Deprecated :meth:`Series.view`, use :meth:`Series.astype` instead to change the dtype (:issue:`20251`) +- Deprecated :meth:`offsets.Tick.is_anchored`, use ``False`` instead (:issue:`55388`) - Deprecated ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock``, use public APIs instead (:issue:`55139`) - Deprecated ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`) - Deprecated accepting a type as an argument in :meth:`Index.view`, call without any arguments instead (:issue:`55709`) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index b3788b6003e67..3a339171d0da2 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -756,11 +756,14 @@ cdef class BaseOffset: raise ValueError(f"{self} is a non-fixed frequency") def is_anchored(self) -> bool: - # TODO: Does this make sense for the general case? It would help - # if there were a canonical docstring for what is_anchored means. + # GH#55388 """ Return boolean whether the frequency is a unit frequency (n=1). + .. deprecated:: 2.2.0 + is_anchored is deprecated and will be removed in a future version. + Use ``obj.n == 1`` instead. + Examples -------- >>> pd.DateOffset().is_anchored() @@ -768,6 +771,12 @@ cdef class BaseOffset: >>> pd.DateOffset(2).is_anchored() False """ + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use \'obj.n == 1\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self.n == 1 # ------------------------------------------------------------------ @@ -954,6 +963,27 @@ cdef class Tick(SingleConstructorOffset): return True def is_anchored(self) -> bool: + # GH#55388 + """ + Return False. + + .. deprecated:: 2.2.0 + is_anchored is deprecated and will be removed in a future version. + Use ``False`` instead. 
+ + Examples + -------- + >>> pd.offsets.Hour().is_anchored() + False + >>> pd.offsets.Hour(2).is_anchored() + False + """ + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use False instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return False # This is identical to BaseOffset.__hash__, but has to be redefined here @@ -2663,6 +2693,13 @@ cdef class QuarterOffset(SingleConstructorOffset): return f"{self._prefix}-{month}" def is_anchored(self) -> bool: + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use \'obj.n == 1 " + f"and obj.startingMonth is not None\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self.n == 1 and self.startingMonth is not None def is_on_offset(self, dt: datetime) -> bool: @@ -3308,6 +3345,13 @@ cdef class Week(SingleConstructorOffset): self._cache = state.pop("_cache", {}) def is_anchored(self) -> bool: + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use \'obj.n == 1 " + f"and obj.weekday is not None\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self.n == 1 and self.weekday is not None @apply_wraps @@ -3597,6 +3641,12 @@ cdef class FY5253Mixin(SingleConstructorOffset): self.variation = state.pop("variation") def is_anchored(self) -> bool: + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use \'obj.n == 1\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return ( self.n == 1 and self.startingMonth is not None and self.weekday is not None ) diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index d4d4a09c44d13..e8de59f84bcc6 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -84,9 +84,7 @@ def test_constructor_timestamp(self, closed, name, freq, periods, tz): tm.assert_index_equal(result, expected) # GH 20976: linspace behavior defined from start/end/periods - if not breaks.freq.is_anchored() and tz is None: - # matches expected only for non-anchored offsets and tz naive - # (anchored/DST transitions cause unequal spacing in expected) + if not breaks.freq.n == 1 and tz is None: result = interval_range( start=start, end=end, periods=periods, name=name, closed=closed ) diff --git a/pandas/tests/tseries/offsets/test_business_quarter.py b/pandas/tests/tseries/offsets/test_business_quarter.py index 44a7f16ab039d..6d7a115054b7f 100644 --- a/pandas/tests/tseries/offsets/test_business_quarter.py +++ b/pandas/tests/tseries/offsets/test_business_quarter.py @@ -9,6 +9,7 @@ import pytest +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( assert_is_on_offset, assert_offset_equal, @@ -54,9 +55,12 @@ def test_repr(self): assert repr(BQuarterBegin(startingMonth=1)) == expected def test_is_anchored(self): - assert BQuarterBegin(startingMonth=1).is_anchored() - assert BQuarterBegin().is_anchored() - assert not BQuarterBegin(2, startingMonth=1).is_anchored() + msg = "BQuarterBegin.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert BQuarterBegin(startingMonth=1).is_anchored() + assert BQuarterBegin().is_anchored() + assert not BQuarterBegin(2, startingMonth=1).is_anchored() def 
test_offset_corner_case(self): # corner @@ -177,9 +181,12 @@ def test_repr(self): assert repr(BQuarterEnd(startingMonth=1)) == expected def test_is_anchored(self): - assert BQuarterEnd(startingMonth=1).is_anchored() - assert BQuarterEnd().is_anchored() - assert not BQuarterEnd(2, startingMonth=1).is_anchored() + msg = "BQuarterEnd.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert BQuarterEnd(startingMonth=1).is_anchored() + assert BQuarterEnd().is_anchored() + assert not BQuarterEnd(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner diff --git a/pandas/tests/tseries/offsets/test_fiscal.py b/pandas/tests/tseries/offsets/test_fiscal.py index 7f8c34bc6832e..824e66a1ddef1 100644 --- a/pandas/tests/tseries/offsets/test_fiscal.py +++ b/pandas/tests/tseries/offsets/test_fiscal.py @@ -7,6 +7,7 @@ import pytest from pandas import Timestamp +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( WeekDay, assert_is_on_offset, @@ -295,15 +296,18 @@ def test_apply(self): class TestFY5253LastOfMonthQuarter: def test_is_anchored(self): - assert makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 - ).is_anchored() - assert makeFY5253LastOfMonthQuarter( - weekday=WeekDay.SAT, startingMonth=3, qtr_with_extra_week=4 - ).is_anchored() - assert not makeFY5253LastOfMonthQuarter( - 2, startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 - ).is_anchored() + msg = "FY5253Quarter.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ).is_anchored() + assert makeFY5253LastOfMonthQuarter( + weekday=WeekDay.SAT, startingMonth=3, qtr_with_extra_week=4 + ).is_anchored() + assert not makeFY5253LastOfMonthQuarter( + 2, startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ).is_anchored() def test_equality(self): assert makeFY5253LastOfMonthQuarter( diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index ddf56e68b1611..62afb8b83d576 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -625,8 +625,11 @@ def test_default_constructor(self, dt): assert (dt + DateOffset(2)) == datetime(2008, 1, 4) def test_is_anchored(self): - assert not DateOffset(2).is_anchored() - assert DateOffset(1).is_anchored() + msg = "DateOffset.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert not DateOffset(2).is_anchored() + assert DateOffset(1).is_anchored() def test_copy(self): assert DateOffset(months=2).copy() == DateOffset(months=2) diff --git a/pandas/tests/tseries/offsets/test_quarter.py b/pandas/tests/tseries/offsets/test_quarter.py index d183645da507d..5fd3ba0a5fb87 100644 --- a/pandas/tests/tseries/offsets/test_quarter.py +++ b/pandas/tests/tseries/offsets/test_quarter.py @@ -9,6 +9,7 @@ import pytest +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( assert_is_on_offset, assert_offset_equal, @@ -53,9 +54,12 @@ def test_repr(self): assert repr(QuarterBegin(startingMonth=1)) == expected def test_is_anchored(self): - assert QuarterBegin(startingMonth=1).is_anchored() - assert QuarterBegin().is_anchored() - assert not QuarterBegin(2, startingMonth=1).is_anchored() + msg = "QuarterBegin.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, 
match=msg): + assert QuarterBegin(startingMonth=1).is_anchored() + assert QuarterBegin().is_anchored() + assert not QuarterBegin(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner @@ -161,9 +165,12 @@ def test_repr(self): assert repr(QuarterEnd(startingMonth=1)) == expected def test_is_anchored(self): - assert QuarterEnd(startingMonth=1).is_anchored() - assert QuarterEnd().is_anchored() - assert not QuarterEnd(2, startingMonth=1).is_anchored() + msg = "QuarterEnd.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert QuarterEnd(startingMonth=1).is_anchored() + assert QuarterEnd().is_anchored() + assert not QuarterEnd(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index b68b91826bc6f..399b7038d3426 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -339,7 +339,10 @@ def test_tick_equalities(cls): @pytest.mark.parametrize("cls", tick_classes) def test_tick_offset(cls): - assert not cls().is_anchored() + msg = f"{cls.__name__}.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert not cls().is_anchored() @pytest.mark.parametrize("cls", tick_classes) diff --git a/pandas/tests/tseries/offsets/test_week.py b/pandas/tests/tseries/offsets/test_week.py index f42ff091af277..0cd6f769769ae 100644 --- a/pandas/tests/tseries/offsets/test_week.py +++ b/pandas/tests/tseries/offsets/test_week.py @@ -21,6 +21,7 @@ WeekOfMonth, ) +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( WeekDay, assert_is_on_offset, @@ -42,10 +43,13 @@ def test_corner(self): Week(weekday=-1) def test_is_anchored(self): - assert Week(weekday=0).is_anchored() - assert not Week().is_anchored() - assert not Week(2, weekday=2).is_anchored() - assert not Week(2).is_anchored() + msg = "Week.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert Week(weekday=0).is_anchored() + assert not Week().is_anchored() + assert not Week(2, weekday=2).is_anchored() + assert not Week(2).is_anchored() offset_cases = [] # not business week From e28b4016f7594df9f1a57af55a3f17ab30c0fc89 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 10 Jan 2024 23:40:55 +0100 Subject: [PATCH 042/396] Backport PR #56788 on branch 2.2.x (Bug: Interchange protocol implementation does not allow for empty string columns) (#56816) Backport PR #56788: Bug: Interchange protocol implementation does not allow for empty string columns Co-authored-by: yashb <74137864+roadrollerdafjorst@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/interchange/column.py | 2 +- pandas/tests/interchange/test_impl.py | 8 ++++++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index b5df0a319bc18..3a4c9438dbc21 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -936,6 +936,7 @@ Other - Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`) - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) - Bug in :func:`infer_freq` and 
:meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`) +- Bug in :func:`pd.api.interchange.from_dataframe` where it raised ``NotImplementedError`` when handling empty string columns (:issue:`56703`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) - Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` raising a ``ValueError`` (:issue:`56478`) @@ -944,7 +945,6 @@ Other - Bug in the error message when assigning an empty :class:`DataFrame` to a column (:issue:`55956`) - Bug when time-like strings were being cast to :class:`ArrowDtype` with ``pyarrow.time64`` type (:issue:`56463`) - .. --------------------------------------------------------------------------- .. _whatsnew_220.contributors: diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index 7f524d6823f30..ee1b5cd34a7f7 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -116,7 +116,7 @@ def dtype(self) -> tuple[DtypeKind, int, str, str]: Endianness.NATIVE, ) elif is_string_dtype(dtype): - if infer_dtype(self._col) == "string": + if infer_dtype(self._col) in ("string", "empty"): return ( DtypeKind.STRING, 8, diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 27ea8ccdd17b1..c7b13f9fd7b2d 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -364,6 +364,14 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: pd.api.interchange.from_dataframe(df) +def test_empty_string_column(): + # https://github.com/pandas-dev/pandas/issues/56703 + df = pd.DataFrame({"a": []}, dtype=str) + df2 = df.__dataframe__() + result = pd.api.interchange.from_dataframe(df2) + tm.assert_frame_equal(df, result) + + def test_large_string(): # GH#56702 pytest.importorskip("pyarrow") From 59c0a2dddc6736e189431cb65e3e943b94786db5 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 11 Jan 2024 02:14:18 +0100 Subject: [PATCH 043/396] Backport PR #56481 on branch 2.2.x (Revert "DEPR: make_block (#56422)") (#56814) Backport PR #56481: Revert "DEPR: make_block (#56422)" Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/internals/api.py | 11 +--------- pandas/tests/internals/test_api.py | 4 +--- pandas/tests/internals/test_internals.py | 20 ++++++------------- .../tests/io/parser/common/test_chunksize.py | 1 - pandas/tests/io/parser/test_parse_dates.py | 9 +++------ 6 files changed, 12 insertions(+), 35 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 3a4c9438dbc21..4265447f05b8b 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -662,7 +662,6 @@ Other Deprecations - Changed :meth:`Timedelta.resolution_string` to return ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``H``, ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`) - Deprecated :attr:`offsets.Day.delta`, :attr:`offsets.Hour.delta`, :attr:`offsets.Minute.delta`, :attr:`offsets.Second.delta`, :attr:`offsets.Milli.delta`, :attr:`offsets.Micro.delta`, 
:attr:`offsets.Nano.delta`, use ``pd.Timedelta(obj)`` instead (:issue:`55498`) - Deprecated :func:`pandas.api.types.is_interval` and :func:`pandas.api.types.is_period`, use ``isinstance(obj, pd.Interval)`` and ``isinstance(obj, pd.Period)`` instead (:issue:`55264`) -- Deprecated :func:`pd.core.internals.api.make_block`, use public APIs instead (:issue:`40226`) - Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`) - Deprecated :meth:`.DataFrameGroupBy.fillna` and :meth:`.SeriesGroupBy.fillna`; use :meth:`.DataFrameGroupBy.ffill`, :meth:`.DataFrameGroupBy.bfill` for forward and backward filling or :meth:`.DataFrame.fillna` to fill with a single value (or the Series equivalents) (:issue:`55718`) - Deprecated :meth:`DateOffset.is_anchored`, use ``obj.n == 1`` for non-Tick subclasses (for Tick this was always False) (:issue:`55388`) @@ -722,6 +721,7 @@ Other Deprecations - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) - Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) - Deprecated the previous implementation of :class:`DataFrame.stack`; specify ``future_stack=True`` to adopt the future version (:issue:`53515`) +- .. --------------------------------------------------------------------------- .. _whatsnew_220.performance: diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index e5ef44d07061e..b0b3937ca47ea 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -9,12 +9,10 @@ from __future__ import annotations from typing import TYPE_CHECKING -import warnings import numpy as np from pandas._libs.internals import BlockPlacement -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.dtypes import ( @@ -52,14 +50,6 @@ def make_block( - Block.make_block_same_class - Block.__init__ """ - warnings.warn( - # GH#40226 - "make_block is deprecated and will be removed in a future version. 
" - "Use public APIs instead.", - DeprecationWarning, - stacklevel=find_stack_level(), - ) - if dtype is not None: dtype = pandas_dtype(dtype) @@ -123,6 +113,7 @@ def maybe_infer_ndim(values, placement: BlockPlacement, ndim: int | None) -> int def __getattr__(name: str): # GH#55139 + import warnings if name in [ "Block", diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index f816cef38b9ab..1251a6ae97a1c 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -68,9 +68,7 @@ def test_deprecations(name): def test_make_block_2d_with_dti(): # GH#41168 dti = pd.date_range("2012", periods=3, tz="UTC") - msg = "make_block is deprecated" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - blk = api.make_block(dti, placement=[0]) + blk = api.make_block(dti, placement=[0]) assert blk.shape == (1, 3) assert blk.values.shape == (1, 3) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 2265522bc7ecb..ce88bae6e02f2 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1383,11 +1383,9 @@ def test_validate_ndim(): values = np.array([1.0, 2.0]) placement = BlockPlacement(slice(2)) msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]" - depr_msg = "make_block is deprecated" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(DeprecationWarning, match=depr_msg): - make_block(values, placement, ndim=2) + make_block(values, placement, ndim=2) def test_block_shape(): @@ -1402,12 +1400,8 @@ def test_make_block_no_pandas_array(block_maker): # https://github.com/pandas-dev/pandas/pull/24866 arr = pd.arrays.NumpyExtensionArray(np.array([1, 2])) - warn = None if block_maker is not make_block else DeprecationWarning - msg = "make_block is deprecated and will be removed in a future version" - # NumpyExtensionArray, no dtype - with tm.assert_produces_warning(warn, match=msg): - result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) + result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] if block_maker is make_block: @@ -1415,16 +1409,14 @@ def test_make_block_no_pandas_array(block_maker): assert result.is_extension is False # NumpyExtensionArray, NumpyEADtype - with tm.assert_produces_warning(warn, match=msg): - result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) + result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False # new_block no longer taked dtype keyword # ndarray, NumpyEADtype - with tm.assert_produces_warning(warn, match=msg): - result = block_maker( - arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim - ) + result = block_maker( + arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim + ) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 9660b283a491b..d5dc723e2c7c5 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -233,7 +233,6 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): assert result.a.dtype == float -@pytest.mark.filterwarnings("ignore:make_block is deprecated:FutureWarning") def 
test_warn_if_chunks_have_mismatched_type(all_parsers): warning_type = None parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index d8f362039ba13..623657b412682 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -33,12 +33,9 @@ from pandas.io.parsers import read_csv -pytestmark = [ - pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" - ), - pytest.mark.filterwarnings("ignore:make_block is deprecated:DeprecationWarning"), -] +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") From b7fa3b9bbc7b2474beb106f6b0da6c90c5caab20 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 11 Jan 2024 03:23:04 +0100 Subject: [PATCH 044/396] Backport PR #56818 on branch 2.2.x (CI: Fix failing builds) (#56819) Backport PR #56818: CI: Fix failing builds Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/tests/io/parser/common/test_chunksize.py | 17 ++++------------- .../tests/io/parser/common/test_read_errors.py | 17 +++-------------- 2 files changed, 7 insertions(+), 27 deletions(-) diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index d5dc723e2c7c5..9f42cf674b0a7 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -220,14 +220,9 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) # Coercions should work without warnings. 
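
Aside, not part of the diff: the hunks in this patch toggle between expecting a warning and expecting none. A minimal sketch of both modes of the ``pandas._testing.assert_produces_warning`` helper used throughout this series (the warning message here is invented for illustration):

    import warnings

    import pandas._testing as tm

    # Expect a FutureWarning whose message matches the given regex:
    with tm.assert_produces_warning(FutureWarning, match="is deprecated"):
        warnings.warn("this option is deprecated", FutureWarning)

    # Expect no warning at all; the assertion fails if one is emitted:
    with tm.assert_produces_warning(False):
        pass

Once the pyarrow ``DeprecationWarning`` is gone, the plain calls in the cleaned-up tests suffice and the context manager is simply dropped, as in the removals below.
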
- warn = None - if parser.engine == "pyarrow": - warn = DeprecationWarning - depr_msg = "Passing a BlockManager to DataFrame|make_block is deprecated" - with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): - with monkeypatch.context() as m: - m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic) - result = parser.read_csv(StringIO(data)) + with monkeypatch.context() as m: + m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic) + result = parser.read_csv(StringIO(data)) assert type(result.a[0]) is np.float64 assert result.a.dtype == float @@ -251,12 +246,8 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): buf = StringIO(data) if parser.engine == "pyarrow": - df = parser.read_csv_check_warnings( - DeprecationWarning, - "Passing a BlockManager to DataFrame is deprecated|" - "make_block is deprecated", + df = parser.read_csv( buf, - check_stacklevel=False, ) else: df = parser.read_csv_check_warnings( diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index db8b586d22fc0..f5a724bad4fa2 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -130,14 +130,9 @@ def test_catch_too_many_names(all_parsers): else "Number of passed names did not match " "number of header fields in the file" ) - depr_msg = "Passing a BlockManager to DataFrame is deprecated" - warn = None - if parser.engine == "pyarrow": - warn = DeprecationWarning - with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) @skip_pyarrow # CSV parse error: Empty CSV file or block @@ -168,13 +163,7 @@ def test_suppress_error_output(all_parsers): data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - warn = None - if parser.engine == "pyarrow": - warn = DeprecationWarning - msg = "Passing a BlockManager to DataFrame|make_block is deprecated" - - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): - result = parser.read_csv(StringIO(data), on_bad_lines="skip") + result = parser.read_csv(StringIO(data), on_bad_lines="skip") tm.assert_frame_equal(result, expected) From 1c34627fd911c1a1cdeb62a4cee4e67e7699dc68 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 11 Jan 2024 05:29:35 +0100 Subject: [PATCH 045/396] Backport PR #55327 on branch 2.2.x (COMPAT: Fix warning with numba >= 0.58.0) (#56820) Backport PR #55327: COMPAT: Fix warning with numba >= 0.58.0 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/util/numba_.py | 9 +++++++++ pandas/tests/window/test_numba.py | 7 +++++++ 3 files changed, 17 insertions(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 4265447f05b8b..5de5bd58bd35f 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -944,6 +944,7 @@ Other - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) - Bug in the error message when assigning an empty :class:`DataFrame` to a column (:issue:`55956`) - Bug when time-like strings were being cast to :class:`ArrowDtype` 
with ``pyarrow.time64`` type (:issue:`56463`) +- Fixed a spurious deprecation warning from ``numba`` >= 0.58.0 when passing a numpy ufunc in :class:`pandas.core.window.Rolling.apply` with ``engine="numba"`` (:issue:`55247`) .. --------------------------------------------------------------------------- .. _whatsnew_220.contributors: diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index b8d489179338b..4825c9fee24b1 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -1,11 +1,14 @@ """Common utilities for Numba operations""" from __future__ import annotations +import types from typing import ( TYPE_CHECKING, Callable, ) +import numpy as np + from pandas.compat._optional import import_optional_dependency from pandas.errors import NumbaUtilError @@ -83,6 +86,12 @@ def jit_user_function(func: Callable) -> Callable: if numba.extending.is_jitted(func): # Don't jit a user passed jitted function numba_func = func + elif getattr(np, func.__name__, False) is func or isinstance( + func, types.BuiltinFunctionType + ): + # Not necessary to jit builtins or np functions + # This will mess up register_jitable + numba_func = func else: numba_func = numba.extending.register_jitable(func) diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index b1cc7ec186f19..139e1ff7f65fd 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -446,3 +446,10 @@ def test_table_method_ewm(self, data, method, axis, nogil, parallel, nopython): engine_kwargs=engine_kwargs, engine="numba" ) tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("numba") +def test_npfunc_no_warnings(): + df = DataFrame({"col1": [1, 2, 3, 4, 5]}) + with tm.assert_produces_warning(False): + df.col1.rolling(2).apply(np.prod, raw=True, engine="numba") From d11887b1f908ee8391103176e9c55b798bdc3be6 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 13 Jan 2024 22:02:29 +0100 Subject: [PATCH 046/396] Backport PR #56849 on branch 2.2.x (REGR: freq "m" (as alias of deprecated "M") raises an error) (#56862) Backport PR #56849: REGR: freq "m" (as alias of deprecated "M") raises an error Co-authored-by: Marco Edward Gorelli --- pandas/_libs/tslibs/offsets.pyx | 6 +++--- pandas/tests/indexes/datetimes/test_date_range.py | 11 +++++++++++ pandas/tests/tslibs/test_to_offset.py | 1 + 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 3a339171d0da2..205ab6f01f8c6 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4845,15 +4845,15 @@ cpdef to_offset(freq, bint is_period=False): tups = zip(split[0::4], split[1::4], split[2::4]) for n, (sep, stride, name) in enumerate(tups): - if is_period is False and name in c_OFFSET_DEPR_FREQSTR: + if is_period is False and name.upper() in c_OFFSET_DEPR_FREQSTR: warnings.warn( f"\'{name}\' is deprecated and will be removed " f"in a future version, please use " - f"\'{c_OFFSET_DEPR_FREQSTR.get(name)}\' instead.", + f"\'{c_OFFSET_DEPR_FREQSTR.get(name.upper())}\' instead.", FutureWarning, stacklevel=find_stack_level(), ) - name = c_OFFSET_DEPR_FREQSTR[name] + name = c_OFFSET_DEPR_FREQSTR[name.upper()] if is_period is True and name in c_REVERSE_OFFSET_DEPR_FREQSTR: if name.startswith("Y"): raise ValueError( diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 
44dd64e162413..d26bee80003e9 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -822,6 +822,17 @@ def test_frequencies_A_deprecated_Y_renamed(self, freq, freq_depr): result = date_range("1/1/2000", periods=2, freq=freq_depr) tm.assert_index_equal(result, expected) + def test_to_offset_with_lowercase_deprecated_freq(self) -> None: + # https://github.com/pandas-dev/pandas/issues/56847 + msg = ( + "'m' is deprecated and will be removed in a future version, please use " + "'ME' instead." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = date_range("2010-01-01", periods=2, freq="m") + expected = DatetimeIndex(["2010-01-31", "2010-02-28"], freq="ME") + tm.assert_index_equal(result, expected) + def test_date_range_bday(self): sdate = datetime(1999, 12, 25) idx = date_range(start=sdate, freq="1B", periods=20) diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index ef68408305232..6e654e65a36d6 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -45,6 +45,7 @@ def test_to_offset_negative(freqstr, expected): assert result.n == expected +@pytest.mark.filterwarnings("ignore:.*'m' is deprecated.*:FutureWarning") @pytest.mark.parametrize( "freqstr", [ From b9dd271d170002e1ddaa44ed76f665204d897acb Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 15 Jan 2024 01:23:47 +0100 Subject: [PATCH 047/396] Backport PR #56873 on branch 2.2.x (CI: unxfail adbc-driver-postgresql test) (#56875) Backport PR #56873: CI: unxfail adbc-driver-postgresql test Co-authored-by: Marco Edward Gorelli --- pandas/tests/io/test_sql.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 6645aefd4f0a7..791b6da3deeca 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2229,12 +2229,14 @@ def test_api_chunksize_read(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_categorical(conn, request): if conn == "postgresql_adbc_conn": - request.node.add_marker( - pytest.mark.xfail( - reason="categorical dtype not implemented for ADBC postgres driver", - strict=True, + adbc = import_optional_dependency("adbc_driver_postgresql", errors="ignore") + if adbc is not None and Version(adbc.__version__) < Version("0.9.0"): + request.node.add_marker( + pytest.mark.xfail( + reason="categorical dtype not implemented for ADBC postgres driver", + strict=True, + ) ) - ) # GH8624 # test that categorical gets written correctly as dense column conn = request.getfixturevalue(conn) From 8e25417474e6d015f70d6b22463648a5b7371470 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 15 Jan 2024 20:57:41 +0100 Subject: [PATCH 048/396] Backport PR #56891 on branch 2.2.x (DOC: Add deprecated markers for downcast keyword) (#56893) Backport PR #56891: DOC: Add deprecated markers for downcast keyword Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index de25a02c6b37c..f8728c61e46fc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7187,6 +7187,8 @@ def fillna( or the string 'infer' which will try to downcast to an appropriate 
equal type (e.g. float64 to int64 if possible). + .. deprecated:: 2.2.0 + Returns ------- {klass} or None @@ -7522,6 +7524,8 @@ def ffill( or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). + .. deprecated:: 2.2.0 + Returns ------- {klass} or None @@ -7713,6 +7717,8 @@ def bfill( or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). + .. deprecated:: 2.2.0 + Returns ------- {klass} or None From 55c9fbd382cc65d896088927c56e811373308a47 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 16 Jan 2024 22:54:47 +0100 Subject: [PATCH 049/396] Backport PR #56906 on branch 2.2.x (DEPR: freq ''2BQ-SEP" for to_period should raise an error) (#56916) Backport PR #56906: DEPR: freq ''2BQ-SEP" for to_period should raise an error Co-authored-by: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> --- pandas/core/arrays/datetimes.py | 11 +++++------ .../indexes/datetimes/methods/test_to_period.py | 17 ----------------- .../tests/indexes/period/test_constructors.py | 9 +++++++++ 3 files changed, 14 insertions(+), 23 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 6b7ddc4a72957..02656b655a0c6 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -39,10 +39,7 @@ tz_convert_from_utc, tzconversion, ) -from pandas._libs.tslibs.dtypes import ( - abbrev_to_npy_unit, - freq_to_period_freqstr, -) +from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit from pandas.errors import PerformanceWarning from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_inclusive @@ -1232,8 +1229,10 @@ def to_period(self, freq=None) -> PeriodArray: if freq is None: freq = self.freqstr or self.inferred_freq - if isinstance(self.freq, BaseOffset): - freq = freq_to_period_freqstr(self.freq.n, self.freq.name) + if isinstance(self.freq, BaseOffset) and hasattr( + self.freq, "_period_dtype_code" + ): + freq = PeriodDtype(self.freq)._freqstr if freq is None: raise ValueError( diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 42a3f3b0f7b42..00c0216a9b3b5 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -111,23 +111,6 @@ def test_to_period_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): with tm.assert_produces_warning(FutureWarning, match=msg): assert prng.freq == freq_depr - @pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2BQE-SEP", "2BQ-SEP"), - ("2BYE-MAR", "2BY-MAR"), - ], - ) - def test_to_period_frequency_BQ_BY_deprecated(self, freq, freq_depr): - # GH#9586 - msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." 
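
For context, the behavior this patch establishes, sketched against one of the frequencies exercised by the new ``test_period_index_from_datetime_index_invalid_freq`` test below (an illustrative aside assuming pandas 2.2 semantics, not part of the diff):

    import pandas as pd

    rng = pd.date_range("01-Jan-2012", periods=8, freq="2BQE-SEP")

    # Business-quarter offsets have no period equivalent, so conversion
    # now raises instead of emitting a deprecation warning:
    try:
        rng.to_period()
    except ValueError as err:
        print(err)  # Invalid frequency: BQE-SEP
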
- - rng = date_range("01-Jan-2012", periods=8, freq=freq) - prng = rng.to_period() - with tm.assert_produces_warning(FutureWarning, match=msg): - prng.freq == freq_depr - def test_to_period_infer(self): # https://github.com/pandas-dev/pandas/issues/33358 rng = date_range( diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index 387dc47c48d20..0cd4ddef5a3c9 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -48,6 +48,15 @@ def test_period_index_frequency_invalid_freq(self, freq_depr): with pytest.raises(ValueError, match=msg): PeriodIndex(["2020-01", "2020-05"], freq=freq_depr) + @pytest.mark.parametrize("freq", ["2BQE-SEP", "2BYE-MAR", "2BME"]) + def test_period_index_from_datetime_index_invalid_freq(self, freq): + # GH#56899 + msg = f"Invalid frequency: {freq[1:]}" + + rng = date_range("01-Jan-2012", periods=8, freq=freq) + with pytest.raises(ValueError, match=msg): + rng.to_period() + class TestPeriodIndex: def test_from_ordinals(self): From d7dd696176baa62411b873e6b4487ee309caa227 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 17 Jan 2024 12:00:40 +0100 Subject: [PATCH 050/396] Backport PR #56910 on branch 2.2.x (DEPR: lowercase freqs 'ye', 'qe', etc. raise a ValueError) (#56924) Backport PR #56910: DEPR: lowercase freqs 'ye', 'qe', etc. raise a ValueError Co-authored-by: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> --- pandas/_libs/tslibs/offsets.pyx | 91 ++++++++++++------- pandas/tests/arrays/test_datetimes.py | 42 +++++++++ .../indexes/datetimes/test_partial_slicing.py | 2 +- .../tests/indexes/period/test_constructors.py | 15 ++- .../tests/indexes/period/test_period_range.py | 44 ++++++--- pandas/tests/resample/test_period_index.py | 29 ++++++ pandas/tests/scalar/period/test_period.py | 4 +- pandas/tests/tslibs/test_to_offset.py | 44 +++++++++ 8 files changed, 218 insertions(+), 53 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 205ab6f01f8c6..764a044f32c82 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4711,29 +4711,7 @@ _lite_rule_alias = { "ns": "ns", } -_dont_uppercase = { - "h", - "bh", - "cbh", - "MS", - "ms", - "s", - "me", - "qe", - "qe-dec", - "qe-jan", - "qe-feb", - "qe-mar", - "qe-apr", - "qe-may", - "qe-jun", - "qe-jul", - "qe-aug", - "qe-sep", - "qe-oct", - "qe-nov", - "ye", -} +_dont_uppercase = _dont_uppercase = {"h", "bh", "cbh", "MS", "ms", "s"} INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" @@ -4752,7 +4730,29 @@ def _get_offset(name: str) -> BaseOffset: -------- _get_offset('EOM') --> BMonthEnd(1) """ - if name.lower() not in _dont_uppercase: + if ( + name not in _lite_rule_alias + and (name.upper() in _lite_rule_alias) + and name != "ms" + ): + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use \'{name.upper()}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + elif ( + name not in _lite_rule_alias + and (name.lower() in _lite_rule_alias) + and name != "MS" + ): + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use \'{name.lower()}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if name not in _dont_uppercase: name = name.upper() name = _lite_rule_alias.get(name, name) name = 
_lite_rule_alias.get(name.lower(), name) @@ -4845,7 +4845,7 @@ cpdef to_offset(freq, bint is_period=False): tups = zip(split[0::4], split[1::4], split[2::4]) for n, (sep, stride, name) in enumerate(tups): - if is_period is False and name.upper() in c_OFFSET_DEPR_FREQSTR: + if not is_period and name.upper() in c_OFFSET_DEPR_FREQSTR: warnings.warn( f"\'{name}\' is deprecated and will be removed " f"in a future version, please use " @@ -4854,31 +4854,52 @@ cpdef to_offset(freq, bint is_period=False): stacklevel=find_stack_level(), ) name = c_OFFSET_DEPR_FREQSTR[name.upper()] - if is_period is True and name in c_REVERSE_OFFSET_DEPR_FREQSTR: - if name.startswith("Y"): + if (not is_period and + name != name.upper() and + name.lower() not in {"s", "ms", "us", "ns"} and + name.upper().split("-")[0].endswith(("S", "E"))): + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use " + f"\'{name.upper()}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + name = name.upper() + if is_period and name.upper() in c_REVERSE_OFFSET_DEPR_FREQSTR: + if name.upper().startswith("Y"): raise ValueError( - f"for Period, please use \'Y{name[2:]}\' " + f"for Period, please use \'Y{name.upper()[2:]}\' " f"instead of \'{name}\'" ) - if (name.startswith("B") or - name.startswith("S") or name.startswith("C")): + if (name.upper().startswith("B") or + name.upper().startswith("S") or + name.upper().startswith("C")): raise ValueError(INVALID_FREQ_ERR_MSG.format(name)) else: raise ValueError( f"for Period, please use " - f"\'{c_REVERSE_OFFSET_DEPR_FREQSTR.get(name)}\' " + f"\'{c_REVERSE_OFFSET_DEPR_FREQSTR.get(name.upper())}\' " f"instead of \'{name}\'" ) - elif is_period is True and name in c_OFFSET_DEPR_FREQSTR: - if name.startswith("A"): + elif is_period and name.upper() in c_OFFSET_DEPR_FREQSTR: + if name.upper().startswith("A"): warnings.warn( f"\'{name}\' is deprecated and will be removed in a future " - f"version, please use \'{c_DEPR_ABBREVS.get(name)}\' " + f"version, please use " + f"\'{c_DEPR_ABBREVS.get(name.upper())}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if name.upper() != name: + warnings.warn( + f"\'{name}\' is deprecated and will be removed in " + f"a future version, please use \'{name.upper()}\' " f"instead.", FutureWarning, stacklevel=find_stack_level(), ) - name = c_OFFSET_DEPR_FREQSTR.get(name) + name = c_OFFSET_DEPR_FREQSTR.get(name.upper()) if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 9a576be10d5ca..8f0576cc65a27 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -766,12 +766,18 @@ def test_iter_zoneinfo_fold(self, tz): "freq, freq_depr", [ ("2ME", "2M"), + ("2SME", "2SM"), + ("2SME", "2sm"), ("2QE", "2Q"), ("2QE-SEP", "2Q-SEP"), ("1YE", "1Y"), ("2YE-MAR", "2Y-MAR"), ("1YE", "1A"), ("2YE-MAR", "2A-MAR"), + ("2ME", "2m"), + ("2QE-SEP", "2q-sep"), + ("2YE-MAR", "2a-mar"), + ("2YE", "2y"), ], ) def test_date_range_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): @@ -784,6 +790,42 @@ def test_date_range_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("freq_depr", ["2H", "2CBH", "2MIN", "2S", "2mS", "2Us"]) + def test_date_range_uppercase_frequency_deprecated(self, freq_depr): + # GH#9586, GH#54939 + 
depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq_depr.lower()[1:]}' instead." + + expected = pd.date_range("1/1/2000", periods=4, freq=freq_depr.lower()) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "freq_depr", + [ + "2ye-mar", + "2ys", + "2qe", + "2qs-feb", + "2bqs", + "2sms", + "2bms", + "2cbme", + "2me", + "2w", + ], + ) + def test_date_range_lowercase_frequency_deprecated(self, freq_depr): + # GH#9586, GH#54939 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.upper()[1:]}' instead." + + expected = pd.date_range("1/1/2000", periods=4, freq=freq_depr.upper()) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) + tm.assert_index_equal(result, expected) + def test_factorize_sort_without_freq(): dta = DatetimeArray._from_sequence([0, 2, 1], dtype="M8[ns]") diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 0ebb88afb6c86..8b493fc61cb58 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -236,7 +236,7 @@ def test_partial_slice_second_precision(self): rng = date_range( start=datetime(2005, 1, 1, 0, 0, 59, microsecond=999990), periods=20, - freq="US", + freq="us", ) s = Series(np.arange(20), rng) diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index 0cd4ddef5a3c9..892eb7b4a00d1 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -26,9 +26,12 @@ class TestPeriodIndexDisallowedFreqs: ("2M", "2ME"), ("2Q-MAR", "2QE-MAR"), ("2Y-FEB", "2YE-FEB"), + ("2M", "2me"), + ("2Q-MAR", "2qe-MAR"), + ("2Y-FEB", "2yE-feb"), ], ) - def test_period_index_frequency_ME_error_message(self, freq, freq_depr): + def test_period_index_offsets_frequency_error_message(self, freq, freq_depr): # GH#52064 msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" @@ -38,7 +41,7 @@ def test_period_index_frequency_ME_error_message(self, freq, freq_depr): with pytest.raises(ValueError, match=msg): period_range(start="2020-01-01", end="2020-01-02", freq=freq_depr) - @pytest.mark.parametrize("freq_depr", ["2SME", "2CBME", "2BYE"]) + @pytest.mark.parametrize("freq_depr", ["2SME", "2sme", "2CBME", "2BYE", "2Bye"]) def test_period_index_frequency_invalid_freq(self, freq_depr): # GH#9586 msg = f"Invalid frequency: {freq_depr[1:]}" @@ -547,7 +550,9 @@ def test_period_range_length(self): assert i1.freq == end_intv.freq assert i1[-1] == end_intv - end_intv = Period("2006-12-31", "1w") + msg = "'w' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + end_intv = Period("2006-12-31", "1w") i2 = period_range(end=end_intv, periods=10) assert len(i1) == len(i2) assert (i1 == i2).all() @@ -576,7 +581,9 @@ def test_mixed_freq_raises(self): with tm.assert_produces_warning(FutureWarning, match=msg): end_intv = Period("2005-05-01", "B") - vals = [end_intv, Period("2006-12-31", "w")] + msg = "'w' is deprecated and will be removed in a future version." 
+ with tm.assert_produces_warning(FutureWarning, match=msg): + vals = [end_intv, Period("2006-12-31", "w")] msg = r"Input has different freq=W-SUN from PeriodIndex\(freq=B\)" depr_msg = r"PeriodDtype\[B\] is deprecated" with pytest.raises(IncompatibleFrequency, match=msg): diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index 2543b49089948..6f8e6d07da8bf 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -181,7 +181,9 @@ def test_construction_from_period(self): def test_mismatched_start_end_freq_raises(self): depr_msg = "Period with BDay freq is deprecated" - end_w = Period("2006-12-31", "1w") + msg = "'w' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + end_w = Period("2006-12-31", "1w") with tm.assert_produces_warning(FutureWarning, match=depr_msg): start_b = Period("02-Apr-2005", "B") @@ -203,19 +205,37 @@ def test_constructor_U(self): with pytest.raises(ValueError, match="Invalid frequency: X"): period_range("2007-1-1", periods=500, freq="X") - def test_H_deprecated_from_time_series(self): + @pytest.mark.parametrize( + "freq,freq_depr", + [ + ("2Y", "2A"), + ("2Y", "2a"), + ("2Y-AUG", "2A-AUG"), + ("2Y-AUG", "2A-aug"), + ], + ) + def test_a_deprecated_from_time_series(self, freq, freq_depr): # GH#52536 - msg = "'H' is deprecated and will be removed in a future version." + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq[1:]}' instead." + + with tm.assert_produces_warning(FutureWarning, match=msg): + period_range(freq=freq_depr, start="1/1/2001", end="12/1/2009") + + @pytest.mark.parametrize("freq_depr", ["2H", "2MIN", "2S", "2US", "2NS"]) + def test_uppercase_freq_deprecated_from_time_series(self, freq_depr): + # GH#52536, GH#54939 + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq_depr.lower()[1:]}' instead." + with tm.assert_produces_warning(FutureWarning, match=msg): - period_range(freq="2H", start="1/1/2001", end="12/1/2009") + period_range("2020-01-01 00:00:00 00:00", periods=2, freq=freq_depr) + + @pytest.mark.parametrize("freq_depr", ["2m", "2q-sep", "2y", "2w"]) + def test_lowercase_freq_deprecated_from_time_series(self, freq_depr): + # GH#52536, GH#54939 + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq_depr.upper()[1:]}' instead." - @pytest.mark.parametrize("freq_depr", ["2A", "A-DEC", "200A-AUG"]) - def test_a_deprecated_from_time_series(self, freq_depr): - # GH#52536 - freq_msg = freq_depr[freq_depr.index("A") :] - msg = ( - f"'{freq_msg}' is deprecated and will be removed in a future version, " - f"please use 'Y{freq_msg[1:]}' instead." 
- ) with tm.assert_produces_warning(FutureWarning, match=msg): period_range(freq=freq_depr, start="1/1/2001", end="12/1/2009") diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index eb80f56dd7d4b..451d2a83c1d5e 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -1006,6 +1006,32 @@ def test_resample_t_l_deprecated(self): result = ser.resample("T").mean() tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "freq, freq_depr, freq_res, freq_depr_res, data", + [ + ("2Q", "2q", "2Y", "2y", [0.5]), + ("2M", "2m", "2Q", "2q", [1.0, 3.0]), + ], + ) + def test_resample_lowercase_frequency_deprecated( + self, freq, freq_depr, freq_res, freq_depr_res, data + ): + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq[1:]}' instead." + depr_msg_res = f"'{freq_depr_res[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq_res[1:]}' instead." + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + rng_l = period_range("2020-01-01", "2020-08-01", freq=freq_depr) + ser = Series(np.arange(len(rng_l)), index=rng_l) + + rng = period_range("2020-01-01", "2020-08-01", freq=freq_res) + expected = Series(data=data, index=rng) + + with tm.assert_produces_warning(FutureWarning, match=depr_msg_res): + result = ser.resample(freq_depr_res).mean() + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( "offset", [ @@ -1031,6 +1057,9 @@ def test_asfreq_invalid_period_freq(self, offset, series_and_frame): ("2Q-FEB", "2QE-FEB"), ("2Y", "2YE"), ("2Y-MAR", "2YE-MAR"), + ("2M", "2me"), + ("2Q", "2qe"), + ("2Y-MAR", "2ye-mar"), ], ) def test_resample_frequency_ME_QE_YE_error_message(series_and_frame, freq, freq_depr): diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index aa4a8b152b19f..d819e903a0bae 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -106,7 +106,9 @@ def test_construction(self): assert i1 == i3 i1 = Period("1982", freq="min") - i2 = Period("1982", freq="MIN") + msg = "'MIN' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + i2 = Period("1982", freq="MIN") assert i1 == i2 i1 = Period(year=2005, month=3, day=1, freq="D") diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index 6e654e65a36d6..8ca55648f3780 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -173,3 +173,47 @@ def test_to_offset_pd_timedelta(kwargs, expected): def test_anchored_shortcuts(shortcut, expected): result = to_offset(shortcut) assert result == expected + + +@pytest.mark.parametrize( + "freq_depr", + [ + "2ye-mar", + "2ys", + "2qe", + "2qs-feb", + "2bqs", + "2sms", + "2bms", + "2cbme", + "2me", + "2w", + ], +) +def test_to_offset_lowercase_frequency_deprecated(freq_depr): + # GH#54939 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.upper()[1:]}' instead." 
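
Taken together, the new ``to_offset`` tests below cover both deprecation directions. Roughly, at the REPL (an illustrative aside assuming pandas 2.2, not part of the diff):

    from pandas.tseries.frequencies import to_offset

    to_offset("2ME")   # <2 * MonthEnds>, canonical spelling, no warning
    to_offset("2me")   # FutureWarning: 'me' is deprecated ... 'ME' instead
    to_offset("2min")  # <2 * Minutes>, canonical spelling, no warning
    to_offset("2MIN")  # FutureWarning: 'MIN' is deprecated ... 'min' instead

The new tests can use ``pytest.raises(FutureWarning, ...)`` presumably because pandas' test configuration escalates its own warnings to errors; outside the test suite these calls warn rather than raise.
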
+ + with pytest.raises(FutureWarning, match=depr_msg): + to_offset(freq_depr) + + +@pytest.mark.parametrize( + "freq_depr", + [ + "2H", + "2BH", + "2MIN", + "2S", + "2Us", + "2NS", + ], +) +def test_to_offset_uppercase_frequency_deprecated(freq_depr): + # GH#54939 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.lower()[1:]}' instead." + + with pytest.raises(FutureWarning, match=depr_msg): + to_offset(freq_depr) From 797cbb7c3cfb7b4a97240603828e1d1641267147 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 17 Jan 2024 19:05:16 +0100 Subject: [PATCH 051/396] Backport PR #56930 on branch 2.2.x (DOC: update install instruction with correct Python version support (including 3.12)) (#56931) Backport PR #56930: DOC: update install instruction with correct Python version support (including 3.12) Co-authored-by: Joris Van den Bossche --- doc/source/getting_started/install.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 1d7eca5223544..b9f7d64d4b2f8 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -21,7 +21,7 @@ Instructions for installing :ref:`from source `, Python version support ---------------------- -Officially Python 3.9, 3.10 and 3.11. +Officially Python 3.9, 3.10, 3.11 and 3.12. Installing pandas ----------------- From 988c3a4feefeeaa53a7fee76c7c977010b53c6c8 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 17 Jan 2024 23:29:35 +0100 Subject: [PATCH 052/396] Backport PR #56824 on branch 2.2.x (DOC: 2.2.0 whatsnew cleanups) (#56933) Backport PR #56824: DOC: 2.2.0 whatsnew cleanups Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.1.4.rst | 2 +- doc/source/whatsnew/v2.2.0.rst | 24 ++++++++++-------------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 57b83a294963b..73b1103c1bd37 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -42,4 +42,4 @@ Bug fixes Contributors ~~~~~~~~~~~~ -.. contributors:: v2.1.3..v2.1.4|HEAD +.. contributors:: v2.1.3..v2.1.4 diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 5de5bd58bd35f..ceb67b4ef956c 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_220: -What's new in 2.2.0 (Month XX, 2024) ------------------------------------- +What's new in 2.2.0 (January XX, 2024) +-------------------------------------- These are the changes in pandas 2.2.0. See :ref:`release` for a full changelog including other versions of pandas. @@ -436,12 +436,6 @@ index levels when joining on two indexes with different levels (:issue:`34133`). result -.. --------------------------------------------------------------------------- -.. _whatsnew_220.api_breaking: - -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - .. 
_whatsnew_220.api_breaking.deps: Increased minimum versions for dependencies @@ -820,7 +814,7 @@ Conversion - Bug in :meth:`DataFrame.astype` when called with ``str`` on unpickled array - the array might change in-place (:issue:`54654`) - Bug in :meth:`DataFrame.astype` where ``errors="ignore"`` had no effect for extension types (:issue:`54654`) - Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`) -- Bug in ``DataFrame.loc`` was not throwing "incompatible dtype warning" (see `PDEP6 `_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. ``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`) +- Bug in :meth:``DataFrame.loc`` was not throwing "incompatible dtype warning" (see `PDEP6 `_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. ``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`) Strings ^^^^^^^ @@ -830,10 +824,10 @@ Strings - Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`) - Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`) - Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`) +- Bug in :meth:`Series.str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string()))`` allows partial matches when regex ends in literal //$ (:issue:`56652`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`) -- Bug in :meth:`str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string()))`` allows partial matches when regex ends in literal //$ (:issue:`56652`) - Bug in comparison operations for ``dtype="string[pyarrow_numpy]"`` raising if dtypes can't be compared (:issue:`56008`) Interval @@ -892,7 +886,6 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in :class:`.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) - Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` would not retain :class:`.Categorical` dtype when the index was a :class:`.CategoricalIndex` that contained NA values (:issue:`54234`) - Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` when ``observed=False`` and ``f="idxmin"`` or ``f="idxmax"`` would incorrectly raise on unobserved categories (:issue:`54234`) - Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`55951`) @@ -906,6 +899,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Bug in :meth:`DataFrame.resample` where bin edges 
were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) +- Bug in :meth:`DataFrame.rolling` and :meth:`Series.rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) - Bug in :meth:`DataFrame.rolling` and :meth:`Series.rolling` where either the ``index`` or ``on`` column was :class:`ArrowDtype` with ``pyarrow.timestamp`` type (:issue:`55849`) Reshaping @@ -927,16 +921,16 @@ Reshaping Sparse ^^^^^^ -- Bug in :meth:`SparseArray.take` when using a different fill value than the array's fill value (:issue:`55181`) +- Bug in :meth:`arrays.SparseArray.take` when using a different fill value than the array's fill value (:issue:`55181`) Other ^^^^^ - :meth:`DataFrame.__dataframe__` did not support pyarrow large strings (:issue:`56702`) - Bug in :func:`DataFrame.describe` when formatting percentiles in the resulting percentile 99.999% is rounded to 100% (:issue:`55765`) +- Bug in :func:`api.interchange.from_dataframe` where it raised ``NotImplementedError`` when handling empty string columns (:issue:`56703`) - Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`) - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) - Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`) -- Bug in :func:`pd.api.interchange.from_dataframe` where it raised ``NotImplementedError`` when handling empty string columns (:issue:`56703`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) - Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` raising a ``ValueError`` (:issue:`56478`) @@ -944,10 +938,12 @@ Other - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) - Bug in the error message when assigning an empty :class:`DataFrame` to a column (:issue:`55956`) - Bug when time-like strings were being cast to :class:`ArrowDtype` with ``pyarrow.time64`` type (:issue:`56463`) -- Fixed a spurious deprecation warning from ``numba`` >= 0.58.0 when passing a numpy ufunc in :class:`pandas.core.window.Rolling.apply` with ``engine="numba"`` (:issue:`55247`) +- Fixed a spurious deprecation warning from ``numba`` >= 0.58.0 when passing a numpy ufunc in :class:`core.window.Rolling.apply` with ``engine="numba"`` (:issue:`55247`) .. --------------------------------------------------------------------------- .. _whatsnew_220.contributors: Contributors ~~~~~~~~~~~~ + +.. 
contributors:: v2.1.4..v2.2.0|HEAD From 74fa7402e487c2a6336ea5291990c9f269c5001a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 18 Jan 2024 07:21:14 -0800 Subject: [PATCH 053/396] Backport PR #56445: Adjust merge tests for new string option (#56938) Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/tests/reshape/merge/test_join.py | 4 +- pandas/tests/reshape/merge/test_merge.py | 46 +++++++++++-------- pandas/tests/reshape/merge/test_merge_asof.py | 15 ++++-- pandas/tests/reshape/merge/test_multi.py | 2 +- 4 files changed, 42 insertions(+), 25 deletions(-) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 1d5ed2d7373ce..9a2f18f33bce5 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -631,7 +631,7 @@ def test_mixed_type_join_with_suffix(self): df.insert(5, "dt", "foo") grouped = df.groupby("id") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.mean() mn = grouped.mean(numeric_only=True) @@ -776,7 +776,7 @@ def test_join_on_tz_aware_datetimeindex(self): ) result = df1.join(df2.set_index("date"), on="date") expected = df1.copy() - expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object) + expected["vals_2"] = Series([np.nan] * 2 + list("tuv")) tm.assert_frame_equal(result, expected) def test_join_datetime_string(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 27959609422f3..ed49f3b758cc5 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -8,7 +8,10 @@ import numpy as np import pytest -from pandas.core.dtypes.common import is_object_dtype +from pandas.core.dtypes.common import ( + is_object_dtype, + is_string_dtype, +) from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -316,14 +319,15 @@ def test_merge_copy(self): merged["d"] = "peekaboo" assert (right["d"] == "bar").all() - def test_merge_nocopy(self, using_array_manager): + def test_merge_nocopy(self, using_array_manager, using_infer_string): left = DataFrame({"a": 0, "b": 1}, index=range(10)) right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) merged = merge(left, right, left_index=True, right_index=True, copy=False) assert np.shares_memory(merged["a"]._values, left["a"]._values) - assert np.shares_memory(merged["d"]._values, right["d"]._values) + if not using_infer_string: + assert np.shares_memory(merged["d"]._values, right["d"]._values) def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame @@ -667,11 +671,13 @@ def test_merge_nan_right(self): "i1_": {0: 0, 1: np.nan}, "i3": {0: 0.0, 1: np.nan}, None: {0: 0, 1: 0}, - } + }, + columns=Index(["i1", "i2", "i1_", "i3", None], dtype=object), ) .set_index(None) .reset_index()[["i1", "i2", "i1_", "i3"]] ) + result.columns = result.columns.astype("object") tm.assert_frame_equal(result, expected, check_dtype=False) def test_merge_nan_right2(self): @@ -820,7 +826,7 @@ def test_overlapping_columns_error_message(self): # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)" + msg = r"Data columns not unique: Index\(\['foo'\], dtype='object|string'\)" with pytest.raises(MergeError, match=msg): 
merge(df, df2) @@ -1498,7 +1504,7 @@ def test_different(self, right_vals): # We allow merging on object and categorical cols and cast # categorical cols to object result = merge(left, right, on="A") - assert is_object_dtype(result.A.dtype) + assert is_object_dtype(result.A.dtype) or is_string_dtype(result.A.dtype) @pytest.mark.parametrize( "d1", [np.int64, np.int32, np.intc, np.int16, np.int8, np.uint8] @@ -1637,7 +1643,7 @@ def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals): result = merge(df1, df2, on=["A"]) assert is_object_dtype(result.A.dtype) result = merge(df2, df1, on=["A"]) - assert is_object_dtype(result.A.dtype) + assert is_object_dtype(result.A.dtype) or is_string_dtype(result.A.dtype) @pytest.mark.parametrize( "df1_vals, df2_vals", @@ -1867,25 +1873,27 @@ def right(): class TestMergeCategorical: - def test_identical(self, left): + def test_identical(self, left, using_infer_string): # merging on the same, should preserve dtypes merged = merge(left, left, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "string" expected = Series( - [CategoricalDtype(categories=["foo", "bar"]), np.dtype("O"), np.dtype("O")], + [CategoricalDtype(categories=["foo", "bar"]), dtype, dtype], index=["X", "Y_x", "Y_y"], ) tm.assert_series_equal(result, expected) - def test_basic(self, left, right): + def test_basic(self, left, right, using_infer_string): # we have matching Categorical dtypes in X # so should preserve the merged column merged = merge(left, right, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "string" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), - np.dtype("O"), + dtype, np.dtype("int64"), ], index=["X", "Y", "Z"], @@ -1989,16 +1997,17 @@ def test_multiindex_merge_with_unordered_categoricalindex(self, ordered): ).set_index(["id", "p"]) tm.assert_frame_equal(result, expected) - def test_other_columns(self, left, right): + def test_other_columns(self, left, right, using_infer_string): # non-merge columns should preserve if possible right = right.assign(Z=right.Z.astype("category")) merged = merge(left, right, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "string" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), - np.dtype("O"), + dtype, CategoricalDtype(categories=[1, 2]), ], index=["X", "Y", "Z"], @@ -2017,7 +2026,9 @@ def test_other_columns(self, left, right): lambda x: x.astype(CategoricalDtype(ordered=True)), ], ) - def test_dtype_on_merged_different(self, change, join_type, left, right): + def test_dtype_on_merged_different( + self, change, join_type, left, right, using_infer_string + ): # our merging columns, X now has 2 different dtypes # so we must be object as a result @@ -2029,9 +2040,8 @@ def test_dtype_on_merged_different(self, change, join_type, left, right): merged = merge(left, right, on="X", how=join_type) result = merged.dtypes.sort_index() - expected = Series( - [np.dtype("O"), np.dtype("O"), np.dtype("int64")], index=["X", "Y", "Z"] - ) + dtype = np.dtype("O") if not using_infer_string else "string" + expected = Series([dtype, dtype, np.dtype("int64")], index=["X", "Y", "Z"]) tm.assert_series_equal(result, expected) def test_self_join_multiple_categories(self): @@ -2499,7 +2509,7 @@ def test_merge_multiindex_columns(): expected_index = MultiIndex.from_tuples(tuples, names=["outer", "inner"]) expected = DataFrame(columns=expected_index) - tm.assert_frame_equal(result, 
expected) + tm.assert_frame_equal(result, expected, check_dtype=False) def test_merge_datetime_upcast_dtype(): diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index b656191cc739d..a2e22ea73fd86 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -3081,8 +3081,11 @@ def test_on_float_by_int(self): tm.assert_frame_equal(result, expected) - def test_merge_datatype_error_raises(self): - msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype" + def test_merge_datatype_error_raises(self, using_infer_string): + if using_infer_string: + msg = "incompatible merge keys" + else: + msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype" left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]}) right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]}) @@ -3134,7 +3137,7 @@ def test_merge_on_nans(self, func, side): else: merge_asof(df, df_null, on="a") - def test_by_nullable(self, any_numeric_ea_dtype): + def test_by_nullable(self, any_numeric_ea_dtype, using_infer_string): # Note: this test passes if instead of using pd.array we use # np.array([np.nan, 1]). Other than that, I (@jbrockmendel) # have NO IDEA what the expected behavior is. @@ -3176,6 +3179,8 @@ def test_by_nullable(self, any_numeric_ea_dtype): } ) expected["value_y"] = np.array([np.nan, np.nan, np.nan], dtype=object) + if using_infer_string: + expected["value_y"] = expected["value_y"].astype("string[pyarrow_numpy]") tm.assert_frame_equal(result, expected) def test_merge_by_col_tz_aware(self): @@ -3201,7 +3206,7 @@ def test_merge_by_col_tz_aware(self): ) tm.assert_frame_equal(result, expected) - def test_by_mixed_tz_aware(self): + def test_by_mixed_tz_aware(self, using_infer_string): # GH 26649 left = pd.DataFrame( { @@ -3225,6 +3230,8 @@ def test_by_mixed_tz_aware(self): columns=["by_col1", "by_col2", "on_col", "value_x"], ) expected["value_y"] = np.array([np.nan], dtype=object) + if using_infer_string: + expected["value_y"] = expected["value_y"].astype("string[pyarrow_numpy]") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", ["float64", "int16", "m8[ns]", "M8[us]"]) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index b1aa6b88bc4ee..402ff049884ba 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -639,7 +639,7 @@ def test_join_multi_levels_outer(self, portfolio, household, expected): axis=0, sort=True, ).reindex(columns=expected.columns) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_index_type=False) def test_join_multi_levels_invalid(self, portfolio, household): portfolio = portfolio.copy() From 160d7a154f61fb55611626c311a1b0216828dea8 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 18 Jan 2024 21:47:02 +0100 Subject: [PATCH 054/396] Backport PR #56587 on branch 2.2.x (ENH: support the Arrow PyCapsule Interface on pandas.DataFrame (export)) (#56944) Backport PR #56587: ENH: support the Arrow PyCapsule Interface on pandas.DataFrame (export) Co-authored-by: Joris Van den Bossche --- pandas/compat/_optional.py | 5 ++- pandas/core/frame.py | 27 +++++++++++++ pandas/tests/frame/test_arrow_interface.py | 45 ++++++++++++++++++++++ pandas/tests/test_optional_dependency.py | 14 +++++++ 4 files changed, 89 insertions(+), 2 
deletions(-) create mode 100644 pandas/tests/frame/test_arrow_interface.py diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 9d04d7c0a1216..2bc6cd46f09a7 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -120,9 +120,8 @@ def import_optional_dependency( The imported module, when found and the version is correct. None is returned when the package is not found and `errors` is False, or when the package's version is too old and `errors` - is ``'warn'``. + is ``'warn'`` or ``'ignore'``. """ - assert errors in {"warn", "raise", "ignore"} package_name = INSTALL_MAPPING.get(name) @@ -163,5 +162,7 @@ def import_optional_dependency( return None elif errors == "raise": raise ImportError(msg) + else: + return None return module diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 15ccbd602c9c8..734756cb8f7c8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -987,6 +987,33 @@ def __dataframe_consortium_standard__( ) return convert_to_standard_compliant_dataframe(self, api_version=api_version) + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the pandas DataFrame as an Arrow C stream PyCapsule. + + This relies on pyarrow to convert the pandas DataFrame to the Arrow + format (and follows the default behaviour of ``pyarrow.Table.from_pandas`` + in its handling of the index, i.e. store the index as a column except + for RangeIndex). + This conversion is not necessarily zero-copy. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the dataframe should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + """ + pa = import_optional_dependency("pyarrow", min_version="14.0.0") + if requested_schema is not None: + requested_schema = pa.Schema._import_from_c_capsule(requested_schema) + table = pa.Table.from_pandas(self, schema=requested_schema) + return table.__arrow_c_stream__() + # ---------------------------------------------------------------------- @property diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py new file mode 100644 index 0000000000000..ac7b51cbdfa92 --- /dev/null +++ b/pandas/tests/frame/test_arrow_interface.py @@ -0,0 +1,45 @@ +import ctypes + +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd + +pa = pytest.importorskip("pyarrow") + + +@td.skip_if_no("pyarrow", min_version="14.0") +def test_dataframe_arrow_interface(): + df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + + capsule = df.__arrow_c_stream__() + assert ( + ctypes.pythonapi.PyCapsule_IsValid( + ctypes.py_object(capsule), b"arrow_array_stream" + ) + == 1 + ) + + table = pa.table(df) + expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + assert table.equals(expected) + + schema = pa.schema([("a", pa.int8()), ("b", pa.string())]) + table = pa.table(df, schema=schema) + expected = expected.cast(schema) + assert table.equals(expected) + + +@td.skip_if_no("pyarrow", min_version="15.0") +def test_dataframe_to_arrow(): + df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + + table = pa.RecordBatchReader.from_stream(df) + expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + assert table.equals(expected) + + schema = pa.schema([("a", pa.int8()), ("b", pa.string())]) + table = pa.RecordBatchReader.from_stream(df, schema=schema) + expected = expected.cast(schema) + assert table.equals(expected) diff --git 
a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index c1d1948d6c31a..52b5f636b1254 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -50,6 +50,20 @@ def test_bad_version(monkeypatch): result = import_optional_dependency("fakemodule") assert result is module + with pytest.raises(ImportError, match="Pandas requires version '1.1.0'"): + import_optional_dependency("fakemodule", min_version="1.1.0") + + with tm.assert_produces_warning(UserWarning): + result = import_optional_dependency( + "fakemodule", errors="warn", min_version="1.1.0" + ) + assert result is None + + result = import_optional_dependency( + "fakemodule", errors="ignore", min_version="1.1.0" + ) + assert result is None + def test_submodule(monkeypatch): # Create a fake module with a submodule From a95029a77f1c00678dda82f76a1b53b4b161b2a0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 19 Jan 2024 03:56:51 +0100 Subject: [PATCH 055/396] Backport PR #56947 on branch 2.2.x (DOC: Set date for 2.2) (#56950) Backport PR #56947: DOC: Set date for 2.2 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ceb67b4ef956c..d9ab0452c8334 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -1,6 +1,6 @@ .. _whatsnew_220: -What's new in 2.2.0 (January XX, 2024) +What's new in 2.2.0 (January 19, 2024) -------------------------------------- These are the changes in pandas 2.2.0. See :ref:`release` for a full changelog From cc37a13c5890a1d8fff7f8314008161b8fb1e21e Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 19 Jan 2024 18:14:09 +0100 Subject: [PATCH 056/396] Backport PR #56949 on branch 2.2.x (CI: avoid FutureWarnings in to_xarray tests) (#56961) Backport PR #56949: CI: avoid FutureWarnings in to_xarray tests Co-authored-by: Luke Manley --- pandas/tests/generic/test_to_xarray.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py index e0d79c3f15282..d8401a8b2ae3f 100644 --- a/pandas/tests/generic/test_to_xarray.py +++ b/pandas/tests/generic/test_to_xarray.py @@ -41,7 +41,7 @@ def test_to_xarray_index_types(self, index_flat, df, using_infer_string): df.index.name = "foo" df.columns.name = "bar" result = df.to_xarray() - assert result.dims["foo"] == 4 + assert result.sizes["foo"] == 4 assert len(result.coords) == 1 assert len(result.data_vars) == 8 tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) @@ -62,7 +62,7 @@ def test_to_xarray_empty(self, df): df.index.name = "foo" result = df[0:0].to_xarray() - assert result.dims["foo"] == 0 + assert result.sizes["foo"] == 0 assert isinstance(result, Dataset) def test_to_xarray_with_multiindex(self, df, using_infer_string): @@ -71,8 +71,8 @@ def test_to_xarray_with_multiindex(self, df, using_infer_string): # MultiIndex df.index = MultiIndex.from_product([["a"], range(4)], names=["one", "two"]) result = df.to_xarray() - assert result.dims["one"] == 1 - assert result.dims["two"] == 4 + assert result.sizes["one"] == 1 + assert result.sizes["two"] == 4 assert len(result.coords) == 2 assert len(result.data_vars) == 8 
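# ------------------------------------------------------------------
# Illustrative sketch, not part of the upstream diff: the dims ->
# sizes swap above tracks xarray's deprecation of mapping-style
# access on Dataset.dims; Dataset.sizes is the forward-compatible
# dimension-name -> length mapping. Assuming pandas plus a recent
# xarray are installed:
import pandas as pd

df = pd.DataFrame({"a": range(4)})
df.index.name = "foo"
ds = df.to_xarray()
assert ds.sizes["foo"] == 4  # ds.dims["foo"] warns on recent xarray
# ------------------------------------------------------------------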
tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) From 859c030f5ea378a0f83aeb1de0c6c0fa6b420604 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 19 Jan 2024 19:21:44 +0100 Subject: [PATCH 057/396] Backport PR #56922 on branch 2.2.x (REGR: DatetimeTZDtype __from_arrow__ interprets UTC values as wall time) (#56962) Backport PR #56922: REGR: DatetimeTZDtype __from_arrow__ interprets UTC values as wall time Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/core/dtypes/dtypes.py | 2 +- pandas/tests/arrays/datetimes/test_constructors.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index e90e92fa0ee1c..1c43ef55c11d7 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -919,7 +919,7 @@ def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> DatetimeArray: else: np_arr = array.to_numpy() - return DatetimeArray._from_sequence(np_arr, dtype=self, copy=False) + return DatetimeArray._simple_new(np_arr, dtype=self) def __setstate__(self, state) -> None: # for pickle compat. __get_state__ is defined in the diff --git a/pandas/tests/arrays/datetimes/test_constructors.py b/pandas/tests/arrays/datetimes/test_constructors.py index daf4aa3b47f56..3652b5fec46bb 100644 --- a/pandas/tests/arrays/datetimes/test_constructors.py +++ b/pandas/tests/arrays/datetimes/test_constructors.py @@ -223,7 +223,7 @@ def test_2d(self, order): ("s", "ns", "US/Central", "Asia/Kolkata", COARSE_TO_FINE_SAFE), ], ) -def test_from_arrowtest_from_arrow_with_different_units_and_timezones_with_( +def test_from_arrow_with_different_units_and_timezones_with( pa_unit, pd_unit, pa_tz, pd_tz, data ): pa = pytest.importorskip("pyarrow") @@ -233,9 +233,8 @@ def test_from_arrowtest_from_arrow_with_different_units_and_timezones_with_( dtype = DatetimeTZDtype(unit=pd_unit, tz=pd_tz) result = dtype.__from_arrow__(arr) - expected = DatetimeArray._from_sequence( - np.array(data, dtype=f"datetime64[{pa_unit}]").astype(f"datetime64[{pd_unit}]"), - dtype=dtype, + expected = DatetimeArray._from_sequence(data, dtype=f"M8[{pa_unit}, UTC]").astype( + dtype, copy=False ) tm.assert_extension_array_equal(result, expected) From dfd0aeda19e475314bb874d46507269777795793 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 19 Jan 2024 19:22:16 +0100 Subject: [PATCH 058/396] Backport PR #56896 on branch 2.2.x (DEPS: Add warning if pyarrow is not installed) (#56963) Backport PR #56896: DEPS: Add warning if pyarrow is not installed Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 5 ++++- pandas/__init__.py | 33 ++++++++++++++++++++++++++++++-- pandas/compat/pyarrow.py | 2 ++ pandas/tests/test_common.py | 22 +++++++++++++++++++++ 4 files changed, 59 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index dd5d090e098b0..a3cffb4b03b93 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -92,7 +92,10 @@ jobs: - name: "Numpy Dev" env_file: actions-311-numpydev.yaml pattern: "not slow and not network and not single_cpu" - test_args: "-W error::DeprecationWarning -W error::FutureWarning" + # Currently restricted the warnings that error to Deprecation Warnings from numpy + # done since pyarrow isn't compatible with 
numpydev always + # TODO: work with pyarrow to revert this? + test_args: "-W error::DeprecationWarning:numpy -W error::FutureWarning:numpy" - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" diff --git a/pandas/__init__.py b/pandas/__init__.py index 7fab662ed2de4..ed524c2bb3619 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -202,8 +202,37 @@ FutureWarning, stacklevel=2, ) -# Don't allow users to use pandas.os or pandas.warnings -del os, warnings + +# DeprecationWarning for missing pyarrow +from pandas.compat.pyarrow import pa_version_under10p1, pa_not_found + +if pa_version_under10p1: + # pyarrow is either too old or nonexistent, warn + from pandas.compat._optional import VERSIONS + + if pa_not_found: + pa_msg = "was not found to be installed on your system." + else: + pa_msg = ( + f"was too old on your system - pyarrow {VERSIONS['pyarrow']} " + "is the current minimum supported version as of this release." + ) + + warnings.warn( + f""" +Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0), +(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries) +but {pa_msg} +If this would cause problems for you, +please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 + """, # noqa: E501 + DeprecationWarning, + stacklevel=2, + ) + del VERSIONS, pa_msg + +# Delete all unnecessary imported modules +del pa_version_under10p1, pa_not_found, warnings, os # module level doc-string __doc__ = """ diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index beb4814914101..2e151123ef2c9 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -8,6 +8,7 @@ import pyarrow as pa _palv = Version(Version(pa.__version__).base_version) + pa_not_found = False pa_version_under10p1 = _palv < Version("10.0.1") pa_version_under11p0 = _palv < Version("11.0.0") pa_version_under12p0 = _palv < Version("12.0.0") @@ -16,6 +17,7 @@ pa_version_under14p1 = _palv < Version("14.0.1") pa_version_under15p0 = _palv < Version("15.0.0") except ImportError: + pa_not_found = True pa_version_under10p1 = True pa_version_under11p0 = True pa_version_under12p0 = True diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index e8a1c961c8cb6..fe24755e8cc23 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -8,6 +8,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import Series import pandas._testing as tm @@ -265,3 +267,23 @@ def test_bz2_missing_import(): code = textwrap.dedent(code) call = [sys.executable, "-c", code] subprocess.check_output(call) + + +@td.skip_if_installed("pyarrow") +@pytest.mark.parametrize("module", ["pandas", "pandas.arrays"]) +def test_pyarrow_missing_warn(module): + # GH56896 + response = subprocess.run( + [sys.executable, "-c", f"import {module}"], + capture_output=True, + check=True, + ) + msg = """ +Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0), +(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries) +but was not found to be installed on your system. 
+If this would cause problems for you, +please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 +""" # noqa: E501 + stderr_msg = response.stderr.decode("utf-8") + assert msg in stderr_msg, stderr_msg From b070774d9aa6e3ed0667c9ebdaa82ddce79a6d4b Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 19 Jan 2024 12:05:34 -0800 Subject: [PATCH 059/396] =?UTF-8?q?Backport=20PR=20#56952:=20DEPR:=20Make?= =?UTF-8?q?=20FutureWarning=20into=20DeprecationWarning=20=E2=80=A6=20(#56?= =?UTF-8?q?964)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Backport PR #56952: DEPR: Make FutureWarning into DeprecationWarning for groupby.apply * Update test_groupby.py * fix finally? --------- Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/core/groupby/groupby.py | 2 +- pandas/core/resample.py | 2 +- pandas/tests/extension/base/groupby.py | 12 +- pandas/tests/frame/test_stack_unstack.py | 2 +- pandas/tests/groupby/aggregate/test_other.py | 4 +- .../groupby/methods/test_value_counts.py | 2 +- pandas/tests/groupby/test_apply.py | 132 +++++++++--------- pandas/tests/groupby/test_apply_mutate.py | 18 +-- pandas/tests/groupby/test_categorical.py | 6 +- pandas/tests/groupby/test_counting.py | 2 +- pandas/tests/groupby/test_groupby.py | 32 ++--- pandas/tests/groupby/test_groupby_dropna.py | 2 +- pandas/tests/groupby/test_groupby_subclass.py | 10 +- pandas/tests/groupby/test_grouping.py | 2 +- pandas/tests/groupby/test_timegrouper.py | 10 +- .../tests/groupby/transform/test_transform.py | 4 +- pandas/tests/resample/test_datetime_index.py | 10 +- pandas/tests/resample/test_resample_api.py | 2 +- .../tests/resample/test_resampler_grouper.py | 37 ++--- pandas/tests/resample/test_time_grouper.py | 2 +- pandas/tests/window/test_groupby.py | 32 ++--- 21 files changed, 167 insertions(+), 158 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 089e15afd465b..5b18455dbe8a8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1831,7 +1831,7 @@ def f(g): message=_apply_groupings_depr.format( type(self).__name__, "apply" ), - category=FutureWarning, + category=DeprecationWarning, stacklevel=find_stack_level(), ) except TypeError: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 48a5f85e1c388..3e9507bd4347f 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2906,7 +2906,7 @@ def _apply( new_message = _apply_groupings_depr.format("DataFrameGroupBy", "resample") with rewrite_warning( target_message=target_message, - target_category=FutureWarning, + target_category=DeprecationWarning, new_message=new_message, ): result = grouped.apply(how, *args, include_groups=include_groups, **kwargs) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 75628ea177fc2..414683b02dcba 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -114,13 +114,13 @@ def test_groupby_extension_transform(self, data_for_grouping): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby("B", group_keys=False).apply(groupby_apply_op) - df.groupby("B", 
group_keys=False).A.apply(groupby_apply_op) + with tm.assert_produces_warning(DeprecationWarning, match=msg): + df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("B", group_keys=False, observed=False).A.apply(groupby_apply_op) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby("A", group_keys=False).apply(groupby_apply_op) - df.groupby("A", group_keys=False).B.apply(groupby_apply_op) + with tm.assert_produces_warning(DeprecationWarning, match=msg): + df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("A", group_keys=False, observed=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 6e1e743eb60de..d8b92091260a3 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1825,7 +1825,7 @@ def test_unstack_bug(self, future_stack): ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 0596193c137e1..00136e572288e 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -502,7 +502,7 @@ def test_agg_timezone_round_trip(): # GH#27110 applying iloc should return a DataFrame msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] ts = df["B"].iloc[2] @@ -510,7 +510,7 @@ def test_agg_timezone_round_trip(): # GH#27110 applying iloc should return a DataFrame msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 2fa79c815d282..8e25177368d8b 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -330,7 +330,7 @@ def test_against_frame_and_seriesgroupby( ) if frame: # compare against apply with DataFrame value_counts - warn = FutureWarning if groupby == "column" else None + warn = DeprecationWarning if groupby == "column" else None msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(warn, match=msg): expected = gp.apply( diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 34b6e7c4cde5f..0ddacfab8c102 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -28,7 +28,7 @@ def store(group): groups.append(group) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): 
df.groupby("index").apply(store) expected_value = DataFrame( {"index": [0] * 10, 0: [1] * 10}, index=pd.RangeIndex(0, 100, 10) @@ -115,7 +115,7 @@ def test_apply_index_date_object(using_infer_string): ) expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("date", group_keys=False).apply( lambda x: x["time"][x["value"].idxmax()] ) @@ -227,7 +227,7 @@ def f_constant_df(group): del names[:] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): df.groupby("a", group_keys=False).apply(func) assert names == group_names @@ -247,7 +247,7 @@ def test_group_apply_once_per_group2(capsys): ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): df.groupby("group_by_column", group_keys=False).apply( lambda df: print("function_called") ) @@ -271,9 +271,9 @@ def fast(group): return group.copy() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): fast_df = df.groupby("A", group_keys=False).apply(fast) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): slow_df = df.groupby("A", group_keys=False).apply(slow) tm.assert_frame_equal(fast_df, slow_df) @@ -297,7 +297,7 @@ def test_groupby_apply_identity_maybecopy_index_identical(func): df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("g", group_keys=False).apply(func) tm.assert_frame_equal(result, df) @@ -342,9 +342,9 @@ def test_groupby_as_index_apply(): tm.assert_index_equal(res_not_as, exp) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): res_as_apply = g_as.apply(lambda x: x.head(2)).index - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index # apply doesn't maintain the original ordering @@ -359,7 +359,7 @@ def test_groupby_as_index_apply(): ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index tm.assert_index_equal(res, ind) @@ -390,17 +390,17 @@ def desc3(group): return result msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = grouped.apply(desc) assert result.index.names == ("A", "B", "stat") msg = 
"DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result2 = grouped.apply(desc2) assert result2.index.names == ("A", "B", "stat") msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result3 = grouped.apply(desc3) assert result3.index.names == ("A", "B", None) @@ -432,7 +432,7 @@ def test_apply_series_yield_constant(df): def test_apply_frame_yield_constant(df): # GH13568 msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby(["A", "B"]).apply(len) assert isinstance(result, Series) assert result.name is None @@ -445,7 +445,7 @@ def test_apply_frame_yield_constant(df): def test_apply_frame_to_series(df): grouped = df.groupby(["A", "B"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = grouped.apply(len) expected = grouped.count()["C"] tm.assert_index_equal(result.index, expected.index) @@ -456,7 +456,7 @@ def test_apply_frame_not_as_index_column_name(df): # GH 35964 - path within _wrap_applied_output not hit by a test grouped = df.groupby(["A", "B"], as_index=False) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = grouped.apply(len) expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D") # TODO(GH#34306): Use assert_frame_equal when column name is not np.nan @@ -481,7 +481,7 @@ def trans2(group): ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").apply(trans) exp = df.groupby("A")["C"].apply(trans2) tm.assert_series_equal(result, exp, check_names=False) @@ -512,7 +512,7 @@ def test_apply_chunk_view(group_keys): df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) expected = df.take([0, 1, 3, 4, 6, 7]) if group_keys: @@ -535,7 +535,7 @@ def test_apply_no_name_column_conflict(): # it works! 
#2605 grouped = df.groupby(["name", "name2"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): grouped.apply(lambda x: x.sort_values("value", inplace=True)) @@ -554,7 +554,7 @@ def f(group): return group msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() @@ -580,7 +580,7 @@ def f(group): return group msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() @@ -620,9 +620,9 @@ def filt2(x): return x[x.category == "c"] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = data.groupby("id_field").apply(filt1) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = data.groupby("id_field").apply(filt2) tm.assert_frame_equal(result, expected) @@ -643,7 +643,7 @@ def test_apply_with_duplicated_non_sorted_axis(test_series): tm.assert_series_equal(result, expected) else: msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("Y", group_keys=False).apply(lambda x: x) # not expecting the order to remain the same for duplicated axis @@ -690,7 +690,7 @@ def f(g): return g msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = grouped.apply(f) assert "value3" in result @@ -706,11 +706,11 @@ def test_apply_numeric_coercion_when_datetime(): {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) df.Date = pd.to_datetime(df.Date) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) tm.assert_series_equal(result["Str"], expected["Str"]) @@ -723,7 +723,7 @@ def get_B(g): return g.iloc[0][["B"]] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").apply(get_B)["B"] expected = df.B expected.index = df.A @@ -750,9 +750,9 @@ def predictions(tool): df2 = df1.copy() df2.oTime = pd.to_datetime(df2.oTime) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = 
df1.groupby("Key").apply(predictions).p1 - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df2.groupby("Key").apply(predictions).p1 tm.assert_series_equal(expected, result) @@ -769,7 +769,7 @@ def test_apply_aggregating_timedelta_and_datetime(): ) df["time_delta_zero"] = df.datetime - df.datetime msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("clientid").apply( lambda ddf: Series( {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} @@ -818,13 +818,13 @@ def func_with_date(batch): return Series({"b": datetime(2015, 1, 1), "c": 2}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1]) dfg_no_conversion_expected.index.name = "a" msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = DataFrame( {"b": pd.Timestamp(2015, 1, 1).as_unit("ns"), "c": 2}, index=[1] @@ -870,7 +870,7 @@ def test_func(x): pass msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = test_df.groupby("groups").apply(test_func) expected = DataFrame() tm.assert_frame_equal(result, expected) @@ -887,9 +887,9 @@ def test_func(x): return x.iloc[[0, -1]] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result1 = test_df1.groupby("groups").apply(test_func) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result2 = test_df2.groupby("groups").apply(test_func) index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None]) index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None]) @@ -904,7 +904,7 @@ def test_groupby_apply_return_empty_chunk(): df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]}) groups = df.groupby("group") msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = groups.apply(lambda group: group[group.value != 1]["value"]) expected = Series( [0], @@ -933,7 +933,7 @@ def test_func_returns_object(): # GH 28652 df = DataFrame({"a": [1, 2]}, index=Index([1, 2])) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("a").apply(lambda g: g.index) expected = Series([Index([1]), Index([2])], index=Index([1, 2], name="a")) @@ -952,7 +952,7 @@ def test_apply_datetime_issue(group_column_dtlike, using_infer_string): df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) msg = 
"DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) dtype = "string" if using_infer_string else "object" @@ -992,7 +992,7 @@ def most_common_values(df): return Series({c: s.value_counts().index[0] for c, s in df.items()}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = tdf.groupby("day").apply(most_common_values)["userId"] expected = Series( ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" @@ -1035,7 +1035,7 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): columns=["observation", "color", "mood", "intensity", "score"], ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes dtype = "string" if using_infer_string else object expected = Series( @@ -1058,7 +1058,7 @@ def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("group", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) @@ -1083,7 +1083,7 @@ def test_apply_function_returns_non_pandas_non_scalar(function, expected_values) # GH 31441 df = DataFrame(["A", "A", "B", "B"], columns=["groups"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("groups").apply(function) expected = Series(expected_values, index=Index(["A", "B"], name="groups")) tm.assert_series_equal(result, expected) @@ -1097,7 +1097,7 @@ def fct(group): df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").apply(fct) expected = Series( [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A") @@ -1110,7 +1110,7 @@ def test_apply_function_index_return(function): # GH: 22541 df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("id").apply(function) expected = Series( [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])], @@ -1148,7 +1148,7 @@ def test_apply_result_type(group_keys, udf): # regardless of whether the UDF happens to be a transform. 
df = DataFrame({"A": ["a", "b"], "B": [1, 2]}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): df_result = df.groupby("A", group_keys=group_keys).apply(udf) series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf) @@ -1165,9 +1165,9 @@ def test_result_order_group_keys_false(): # apply result order should not depend on whether index is the same or just equal df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A", group_keys=False).apply(lambda x: x) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) tm.assert_frame_equal(result, expected) @@ -1181,11 +1181,11 @@ def test_apply_with_timezones_aware(): df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result1 = df1.groupby("x", group_keys=False).apply( lambda df: df[["x", "y"]].copy() ) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result2 = df2.groupby("x", group_keys=False).apply( lambda df: df[["x", "y"]].copy() ) @@ -1205,7 +1205,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): ) expected = DataFrame( - {"a": [264, 297], "b": [15, 6], "c": [150, 60]}, + {"b": [15, 6], "c": [150, 60]}, index=Index([88, 99], name="a"), ) @@ -1213,7 +1213,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): grp = df.groupby(by="a") msg = "The behavior of DataFrame.sum with axis=None is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - result = grp.apply(sum) + result = grp.apply(sum, include_groups=False) tm.assert_frame_equal(result, expected) # Check output when another method is called before .apply() @@ -1221,7 +1221,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): args = get_groupby_method_args(reduction_func, df) _ = getattr(grp, reduction_func)(*args) with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - result = grp.apply(sum) + result = grp.apply(sum, include_groups=False) tm.assert_frame_equal(result, expected) @@ -1244,7 +1244,7 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): grp = df.groupby(["A", "B"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = grp.apply(lambda x: x.head(1)) expected = df.iloc[[0, 2, 3]] @@ -1294,7 +1294,7 @@ def test_apply_dropna_with_indexed_same(dropna): index=list("xxyxz"), ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) expected = df.dropna() if 
dropna else df.iloc[[0, 3, 1, 2, 4]] tm.assert_frame_equal(result, expected) @@ -1321,7 +1321,7 @@ def test_apply_as_index_constant_lambda(as_index, expected): # GH 13217 df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) tm.assert_equal(result, expected) @@ -1333,7 +1333,7 @@ def test_sort_index_groups(): index=range(5), ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("C").apply(lambda x: x.A.sort_index()) expected = Series( range(1, 6), @@ -1355,7 +1355,7 @@ def test_positional_slice_groups_datetimelike(): } ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = expected.groupby( [expected.let, expected.date.dt.date], group_keys=False ).apply(lambda x: x.iloc[0:]) @@ -1402,9 +1402,9 @@ def test_apply_na(dropna): ) dfgrp = df.groupby("grp", dropna=dropna) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) tm.assert_frame_equal(result, expected) @@ -1412,7 +1412,7 @@ def test_apply_na(dropna): def test_apply_empty_string_nan_coerce_bug(): # GH#24903 msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = ( DataFrame( { @@ -1449,7 +1449,7 @@ def test_apply_index_key_error_bug(index_values): index=Index(["a2", "a3", "aa"], name="a"), ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = result.groupby("a").apply( lambda df: Series([df["b"].mean()], index=["b_mean"]) ) @@ -1501,7 +1501,7 @@ def test_apply_nonmonotonic_float_index(arg, idx): # GH 34455 expected = DataFrame({"col": arg}, index=idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = expected.groupby("col", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, expected) @@ -1554,7 +1554,7 @@ def test_include_groups(include_groups): # GH#7155 df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) gb = df.groupby("a") - warn = FutureWarning if include_groups else None + warn = DeprecationWarning if include_groups else None msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(warn, match=msg): result = gb.apply(lambda x: x.sum(), include_groups=include_groups) @@ -1590,11 +1590,11 @@ def test_builtins_apply(keys, f): npfunc = lambda x: getattr(np, fname)(x, axis=0) # 
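As the test_include_groups hunk above shows, passing include_groups=False opts into the future behavior and emits no warning at all; a user-facing sketch, assuming pandas 2.2.x:

import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
# With include_groups=False the applied frame excludes the grouping
# column "a", so no DeprecationWarning is raised.
result = df.groupby("a").apply(lambda x: x.sum(), include_groups=False)
print(result)  # sums of "b" per group, indexed by "a"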
numpy's equivalent function msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = gb.apply(npfunc) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected2 = gb.apply(lambda x: npfunc(x)) tm.assert_frame_equal(result, expected2) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py index 09d5e06bf6ddd..cfd1a4bca9d91 100644 --- a/pandas/tests/groupby/test_apply_mutate.py +++ b/pandas/tests/groupby/test_apply_mutate.py @@ -14,12 +14,12 @@ def test_group_by_copy(): ).set_index("name") msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): grp_by_same_value = df.groupby(["age"], group_keys=False).apply( lambda group: group ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): grp_by_copy = df.groupby(["age"], group_keys=False).apply( lambda group: group.copy() ) @@ -54,9 +54,9 @@ def f_no_copy(x): return x.groupby("cat2")["rank"].min() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): grpby_copy = df.groupby("cat1").apply(f_copy) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): grpby_no_copy = df.groupby("cat1").apply(f_no_copy) tm.assert_series_equal(grpby_copy, grpby_no_copy) @@ -68,14 +68,14 @@ def test_no_mutate_but_looks_like(): df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) tm.assert_series_equal(result1, result2) -def test_apply_function_with_indexing(): +def test_apply_function_with_indexing(warn_copy_on_write): # GH: 33058 df = pd.DataFrame( {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]} @@ -86,7 +86,9 @@ def fn(x): return x.col2 msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning( + DeprecationWarning, match=msg, raise_on_extra_warnings=not warn_copy_on_write + ): result = df.groupby(["col1"], as_index=False).apply(fn) expected = pd.Series( [1, 2, 0, 4, 5, 0], diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 7a91601bf688f..f60ff65536f20 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -125,7 +125,7 @@ def f(x): return x.drop_duplicates("person_name").iloc[0] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + 
with tm.assert_produces_warning(DeprecationWarning, match=msg): result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") @@ -333,7 +333,7 @@ def test_apply(ordered): idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = Series(1, index=idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = grouped.apply(lambda x: 1) tm.assert_series_equal(result, expected) @@ -2050,7 +2050,7 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde df["a2"] = df["a"] df = df.set_index(keys) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - warn = FutureWarning if method == "apply" and index_kind == "range" else None + warn = DeprecationWarning if method == "apply" and index_kind == "range" else None msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(warn, match=msg): op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 16d7fe61b90ad..2622895f9f8d2 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -290,7 +290,7 @@ def test_count(): for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) tm.assert_frame_equal(left, right) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4c903e691add1..ed9acdd0c9dde 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -163,7 +163,7 @@ def max_value(group): return group.loc[group["value"].idxmax()] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): applied = df.groupby("A").apply(max_value) result = applied.dtypes expected = df.dtypes @@ -186,7 +186,7 @@ def f_0(grp): expected = df.groupby("A").first()[["B"]] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").apply(f_0)[["B"]] tm.assert_frame_equal(result, expected) @@ -196,7 +196,7 @@ def f_1(grp): return grp.iloc[0] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").apply(f_1)[["B"]] e = expected.copy() e.loc["Tiger"] = np.nan @@ -208,7 +208,7 @@ def f_2(grp): return grp.iloc[0] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").apply(f_2)[["B"]] e = expected.copy() e.loc["Pony"] = np.nan @@ -221,7 +221,7 @@ def f_3(grp): return grp.iloc[0] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with 
tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").apply(f_3)[["C"]] e = df.groupby("A").first()[["C"]] e.loc["Pony"] = pd.NaT @@ -234,7 +234,7 @@ def f_4(grp): return grp.iloc[0].loc["C"] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").apply(f_4) e = df.groupby("A").first()["C"].copy() e.loc["Pony"] = np.nan @@ -421,9 +421,9 @@ def f3(x): # correct result msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result1 = df.groupby("a").apply(f1) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result2 = df2.groupby("a").apply(f1) tm.assert_frame_equal(result1, result2) @@ -1377,13 +1377,13 @@ def summarize_random_name(df): return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): metrics = df.groupby("A").apply(summarize) assert metrics.columns.name is None - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): metrics = df.groupby("A").apply(summarize, "metrics") assert metrics.columns.name == "metrics" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): metrics = df.groupby("A").apply(summarize_random_name) assert metrics.columns.name is None @@ -1678,7 +1678,7 @@ def test_dont_clobber_name_column(): ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("key", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) @@ -1762,7 +1762,7 @@ def freducex(x): # make sure all these work msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): grouped.apply(f) grouped.aggregate(freduce) grouped.aggregate({"C": freduce, "D": freduce}) @@ -1785,7 +1785,7 @@ def f(group): return group.copy() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): df.groupby("a", sort=False, group_keys=False).apply(f) expected_names = [0, 1, 2] @@ -1993,7 +1993,7 @@ def test_sort(x): tm.assert_frame_equal(x, x.sort_values(by=sort_column)) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): g.apply(test_sort) @@ -2180,7 +2180,7 @@ def test_empty_groupby_apply_nonunique_columns(): df.columns = [0, 1, 2, 0] gb = df.groupby(df[1], group_keys=False) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): res = gb.apply(lambda x: x) assert (res.dtypes == 
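Several hunks above select the non-grouping columns only after apply (e.g. df.groupby("A").apply(f_0)[["B"]]); selecting them before apply avoids the warning on any pandas version. A sketch:

import pandas as pd

df = pd.DataFrame({"A": ["x", "x", "y"], "B": [1.0, 2.0, 3.0]})
# The selection excludes the grouper, so apply never operates on it
# and no DeprecationWarning is emitted.
result = df.groupby("A")[["B"]].apply(lambda g: g.iloc[0])
print(result)  # first "B" row per group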
df.dtypes).all() diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 73638eba0a3b3..9155f2cccf117 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -325,7 +325,7 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, df = pd.DataFrame(data) gb = df.groupby("groups", dropna=dropna) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) mi_tuples = tuple(zip(data["groups"], selected_data["values"])) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 17ef6ee913463..0832b67b38098 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -74,7 +74,10 @@ def func(group): msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning( - FutureWarning, match=msg, raise_on_extra_warnings=False + DeprecationWarning, + match=msg, + raise_on_extra_warnings=False, + check_stacklevel=False, ): result = custom_df.groupby("c").apply(func) expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c")) @@ -123,7 +126,10 @@ def test_groupby_resample_preserves_subclass(obj): # Confirm groupby.resample() preserves dataframe type msg = "DataFrameGroupBy.resample operated on the grouping columns" with tm.assert_produces_warning( - FutureWarning, match=msg, raise_on_extra_warnings=False + DeprecationWarning, + match=msg, + raise_on_extra_warnings=False, + check_stacklevel=False, ): result = df.groupby("Buyer").resample("5D").sum() assert isinstance(result, obj) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 363ff883385db..d763b67059375 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -238,7 +238,7 @@ def test_grouper_creation_bug(self): tm.assert_frame_equal(result, expected) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = g.apply(lambda x: x.sum()) expected["A"] = [0, 2, 4] expected = expected.loc[:, ["A", "B"]] diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index d357a65e79796..8ef7c2b8ce859 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -478,10 +478,10 @@ def sumfunc_series(x): return Series([x["value"].sum()], ("sum",)) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) tm.assert_frame_equal( result.reset_index(drop=True), expected.reset_index(drop=True) @@ -499,9 +499,9 @@ def sumfunc_value(x): return x.value.sum() msg = "DataFrameGroupBy.apply operated on the grouping columns" - 
with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) tm.assert_series_equal( result.reset_index(drop=True), expected.reset_index(drop=True) @@ -929,7 +929,7 @@ def test_groupby_apply_timegrouper_with_nat_apply_squeeze( # function that returns a Series msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): res = gb.apply(lambda x: x["Quantity"] * 2) dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date") diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index a2ecd6c65db60..fd9bd5cc55538 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -668,7 +668,7 @@ def f(group): grouped = df.groupby("c") msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = grouped.apply(f) assert result["d"].dtype == np.float64 @@ -826,7 +826,7 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): if op != "shift" or not isinstance(gb_target.get("by"), (str, list)): warn = None else: - warn = FutureWarning + warn = DeprecationWarning msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(warn, match=msg): expected = gb.apply(targop) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 80583f5d3c5f2..ddd81ab1d347d 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1080,10 +1080,10 @@ def test_resample_segfault(unit): ).set_index("timestamp") df.index = df.index.as_unit(unit) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("ID").resample("5min").sum() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) tm.assert_frame_equal(result, expected) @@ -1104,7 +1104,7 @@ def test_resample_dtype_preservation(unit): assert result.val.dtype == np.int32 msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("group").resample("1D").ffill() assert result.val.dtype == np.int32 @@ -1881,10 +1881,10 @@ def f(data, add_arg): df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10)) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) msg = 
"DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = df.groupby("A").resample("D").mean().multiply(multiplier) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 7e8779ab48b7e..d3e906827b754 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -78,7 +78,7 @@ def test_groupby_resample_api(): index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"]) expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 337c5ff53bd14..550523a432a89 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -70,10 +70,10 @@ def f_0(x): return x.set_index("date").resample("D").asfreq() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = df.groupby("id").apply(f_0) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.set_index("date").groupby("id").resample("D").asfreq() tm.assert_frame_equal(result, expected) @@ -89,10 +89,10 @@ def f_1(x): return x.resample("1D").ffill() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = df.groupby("group").apply(f_1) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("group").resample("1D").ffill() tm.assert_frame_equal(result, expected) @@ -109,7 +109,7 @@ def test_getitem(test_frame): tm.assert_series_equal(result, expected) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = g.resample("2s").mean().B tm.assert_series_equal(result, expected) @@ -235,10 +235,10 @@ def test_methods(f, test_frame): r = g.resample("2s") msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = getattr(r, f)() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) tm.assert_equal(result, expected) @@ -257,10 +257,10 @@ def test_methods_std_var(f, test_frame): g = test_frame.groupby("A") r = 
g.resample("2s") msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = getattr(r, f)(ddof=1) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) tm.assert_frame_equal(result, expected) @@ -271,14 +271,14 @@ def test_apply(test_frame): # reduction msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.resample("2s").sum() def f_0(x): return x.resample("2s").sum() msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = r.apply(f_0) tm.assert_frame_equal(result, expected) @@ -286,7 +286,7 @@ def f_1(x): return x.resample("2s").apply(lambda y: y.sum()) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = g.apply(f_1) # y.sum() results in int64 instead of int32 on 32-bit architectures expected = expected.astype("int64") @@ -356,7 +356,7 @@ def test_resample_groupby_with_label(unit): index = date_range("2000-01-01", freq="2D", periods=5, unit=unit) df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("col0").resample("1W", label="left").sum() mi = [ @@ -379,7 +379,7 @@ def test_consistency_with_window(test_frame): df = test_frame expected = Index([1, 2, 3], name="A") msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").resample("2s").mean() assert result.index.nlevels == 2 tm.assert_index_equal(result.index.levels[0], expected) @@ -479,7 +479,7 @@ def test_empty(keys): # GH 26411 df = DataFrame([], columns=["a", "b"], index=TimedeltaIndex([])) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() expected = ( DataFrame(columns=["a", "b"]) @@ -503,7 +503,8 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): if consolidate: df = df._consolidate() - with tm.assert_produces_warning(FutureWarning): + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby(["key"]).resample("W", on="date").min() idx = pd.MultiIndex.from_arrays( [ @@ -555,7 +556,7 @@ def test_resample_no_index(keys): df["date"] = pd.to_datetime(df["date"]) df = df.set_index("date") msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with 
tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() expected = DataFrame(columns=["a", "b", "date"]).set_index(keys, drop=False) expected["date"] = pd.to_datetime(expected["date"]) @@ -604,7 +605,7 @@ def test_groupby_resample_size_all_index_same(): index=date_range("31/12/2000 18:00", freq="h", periods=12), ) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("A").resample("D").size() mi_exp = pd.MultiIndex.from_arrays( diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 3d9098917a12d..3f9340b800eae 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -346,7 +346,7 @@ def test_groupby_resample_interpolate(): df["week_starting"] = date_range("01/01/2018", periods=3, freq="W") msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = ( df.set_index("week_starting") .groupby("volume") diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 400bf10817ab8..45e7e07affd75 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -101,7 +101,7 @@ def test_rolling(self, f, roll_frame): result = getattr(r, f)() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(lambda x: getattr(x.rolling(4), f)()) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) @@ -117,7 +117,7 @@ def test_rolling_ddof(self, f, roll_frame): result = getattr(r, f)(ddof=1) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) @@ -135,7 +135,7 @@ def test_rolling_quantile(self, interpolation, roll_frame): result = r.quantile(0.4, interpolation=interpolation) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply( lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) ) @@ -182,7 +182,7 @@ def func(x): return getattr(x.rolling(4), f)(roll_frame) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(func) # GH 39591: The grouped column should be all np.nan # (groupby.apply inserts 0s for cov) @@ -200,7 +200,7 @@ def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(func) tm.assert_series_equal(result, expected) @@ -247,7 +247,7 @@ def test_rolling_apply(self, raw, 
roll_frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) @@ -793,11 +793,11 @@ def test_groupby_rolling_object_doesnt_affect_groupby_apply(self, roll_frame): # GH 39732 g = roll_frame.groupby("A", group_keys=False) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(lambda x: x.rolling(4).sum()).index _ = g.rolling(window=4) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): result = g.apply(lambda x: x.rolling(4).sum()).index tm.assert_index_equal(result, expected) @@ -975,7 +975,7 @@ def test_groupby_monotonic(self): df = df.sort_values("date") msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = ( df.set_index("date") .groupby("name") @@ -1000,7 +1000,7 @@ def test_datelike_on_monotonic_within_each_group(self): ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = ( df.set_index("B") .groupby("A") @@ -1036,7 +1036,7 @@ def test_expanding(self, f, frame): result = getattr(r, f)() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(lambda x: getattr(x.expanding(), f)()) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) @@ -1052,7 +1052,7 @@ def test_expanding_ddof(self, f, frame): result = getattr(r, f)(ddof=0) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) @@ -1070,7 +1070,7 @@ def test_expanding_quantile(self, interpolation, frame): result = r.quantile(0.4, interpolation=interpolation) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply( lambda x: x.expanding().quantile(0.4, interpolation=interpolation) ) @@ -1092,7 +1092,7 @@ def func_0(x): return getattr(x.expanding(), f)(frame) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(func_0) # GH 39591: groupby.apply returns 1 instead of nan for windows # with all nan values @@ -1109,7 +1109,7 @@ def func_1(x): return getattr(x.B.expanding(), f)(pairwise=True) msg = "DataFrameGroupBy.apply 
operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply(func_1) tm.assert_series_equal(result, expected) @@ -1120,7 +1120,7 @@ def test_expanding_apply(self, raw, frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): expected = g.apply( lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw) ) From fd3f57170aa1af588ba877e8e28c158a20a4886d Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Fri, 19 Jan 2024 12:10:08 -0800 Subject: [PATCH 060/396] RLS: 2.2.0 From 2fa26fd1fdaaef67b9cff812006b21fe0a76ebe9 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 20 Jan 2024 01:20:40 +0100 Subject: [PATCH 061/396] =?UTF-8?q?Backport=20PR=20#56967=20on=20branch=20?= =?UTF-8?q?2.2.x=20(CI:=20Adjust=20pyarrow=20depr=20warning=20to=20account?= =?UTF-8?q?=20for=20different=20newlines=20on=20=E2=80=A6)=20(#56969)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #56967: CI: Adjust pyarrow depr warning to account for different newlines on … Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/tests/test_common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index fe24755e8cc23..4af71be11fe6b 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -286,4 +286,7 @@ def test_pyarrow_missing_warn(module): please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 """ # noqa: E501 stderr_msg = response.stderr.decode("utf-8") - assert msg in stderr_msg, stderr_msg + # Split by \n to avoid \r\n vs \n differences on Windows/Unix + # https://stackoverflow.com/questions/11989501/replacing-r-n-with-n + stderr_msg = "\n".join(stderr_msg.splitlines()) + assert msg in stderr_msg From f538741432edf55c6b9fb5d0d496d2dd1d7c2457 Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Fri, 19 Jan 2024 16:38:41 -0800 Subject: [PATCH 062/396] RLS: 2.2.0 From 22ae7890ddc15422431a689f5f7e6aff65e2917b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 20 Jan 2024 21:07:16 +0100 Subject: [PATCH 063/396] Backport PR #56980 on branch 2.2.x (WEB: Add version 2.2 to the dropdown) (#56983) Backport PR #56980: WEB: Add version 2.2 to the dropdown Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- web/pandas/versions.json | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/web/pandas/versions.json b/web/pandas/versions.json index e355005c7c937..09b3d8492a2c7 100644 --- a/web/pandas/versions.json +++ b/web/pandas/versions.json @@ -5,11 +5,16 @@ "url": "https://pandas.pydata.org/docs/dev/" }, { - "name": "2.1 (stable)", - "version": "2.1", + "name": "2.2 (stable)", + "version": "2.2", "url": "https://pandas.pydata.org/docs/", "preferred": true }, + { + "name": "2.1", + "version": "2.1", + "url": "https://pandas.pydata.org/pandas-docs/version/2.1/", + }, { "name": "2.0", "version": "2.0", From 
6c563e3299543746befc6822fe53dea5ddc48979 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 20 Jan 2024 21:36:53 +0100 Subject: [PATCH 064/396] Backport PR #56986 on branch 2.2.x (WEB: Fix typo in dropdown page) (#56987) Backport PR #56986: WEB: Fix typo in dropdown page Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- web/pandas/versions.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/pandas/versions.json b/web/pandas/versions.json index 09b3d8492a2c7..2d2599ae8585b 100644 --- a/web/pandas/versions.json +++ b/web/pandas/versions.json @@ -13,7 +13,7 @@ { "name": "2.1", "version": "2.1", - "url": "https://pandas.pydata.org/pandas-docs/version/2.1/", + "url": "https://pandas.pydata.org/pandas-docs/version/2.1/" }, { "name": "2.0", From bfe6c4f057e477544ee53a346d615cf875971aaf Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 22 Jan 2024 19:24:51 +0100 Subject: [PATCH 065/396] Backport PR #56982 on branch 2.2.x (DOC: Add release notes for 2.2.1) (#56998) Backport PR #56982: DOC: Add release notes for 2.2.1 Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v2.2.1.rst | 36 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 doc/source/whatsnew/v2.2.1.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index ec024f36d78b1..3a2ab4c17d1bd 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 2.2 .. toctree:: :maxdepth: 2 + v2.2.1 v2.2.0 Version 2.1 diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst new file mode 100644 index 0000000000000..22a6c0cd1028f --- /dev/null +++ b/doc/source/whatsnew/v2.2.1.rst @@ -0,0 +1,36 @@ +.. _whatsnew_221: + +What's new in 2.2.1 (February XX, 2024) +--------------------------------------- + +These are the changes in pandas 2.2.1. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.bug_fixes: + +Bug fixes +~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.other: + +Other +~~~~~ +- + +.. --------------------------------------------------------------------------- +.. 
_whatsnew_221.contributors: + +Contributors +~~~~~~~~~~~~ From 987dcbbce3c9bdb5e422ff6dee65bd057064b7dc Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 22 Jan 2024 21:10:25 +0100 Subject: [PATCH 066/396] Backport PR #57005 on branch 2.2.x (CI: pyarrow nightly failures) (#57013) Backport PR #57005: CI: pyarrow nightly failures Co-authored-by: Luke Manley --- pandas/tests/frame/test_arrow_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py index ac7b51cbdfa92..098d1829b973c 100644 --- a/pandas/tests/frame/test_arrow_interface.py +++ b/pandas/tests/frame/test_arrow_interface.py @@ -35,11 +35,11 @@ def test_dataframe_arrow_interface(): def test_dataframe_to_arrow(): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) - table = pa.RecordBatchReader.from_stream(df) + table = pa.RecordBatchReader.from_stream(df).read_all() expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}) assert table.equals(expected) schema = pa.schema([("a", pa.int8()), ("b", pa.string())]) - table = pa.RecordBatchReader.from_stream(df, schema=schema) + table = pa.RecordBatchReader.from_stream(df, schema=schema).read_all() expected = expected.cast(schema) assert table.equals(expected) From 662e3f8632f112b3f85c68b62445766c8186396c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 23 Jan 2024 01:22:04 +0100 Subject: [PATCH 067/396] Backport PR #57011 on branch 2.2.x (Remove SKIP summary from CI logs) (#57020) Backport PR #57011: Remove SKIP summary from CI logs Co-authored-by: William Ayd --- ci/run_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 48ef21686a26f..39ab0890a32d1 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -10,7 +10,7 @@ echo PYTHONHASHSEED=$PYTHONHASHSEED COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml" -PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" +PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fE -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" From 3b833cfcc81ba55a1f46603b06fb9a043aa1bba9 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 23 Jan 2024 01:22:19 +0100 Subject: [PATCH 068/396] Backport PR #57018 on branch 2.2.x (REGR: merge_ordered with fill_method="ffill" and how="left") (#57021) Backport PR #57018: REGR: merge_ordered with fill_method="ffill" and how="left" Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 2 +- pandas/core/reshape/merge.py | 7 +++--- .../tests/reshape/merge/test_merge_ordered.py | 23 +++++++++++++++++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 22a6c0cd1028f..75445c978d262 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -13,7 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) .. 
--------------------------------------------------------------------------- .. _whatsnew_221.bug_fixes: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 410301b7697f2..646f40f6141d8 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1930,10 +1930,9 @@ def get_result(self, copy: bool | None = True) -> DataFrame: if self.fill_method == "ffill": if left_indexer is None: - raise TypeError("left_indexer cannot be None") - left_indexer = cast("npt.NDArray[np.intp]", left_indexer) - right_indexer = cast("npt.NDArray[np.intp]", right_indexer) - left_join_indexer = libjoin.ffill_indexer(left_indexer) + left_join_indexer = None + else: + left_join_indexer = libjoin.ffill_indexer(left_indexer) if right_indexer is None: right_join_indexer = None else: diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index abd61026b4e37..0bd3ca3cf2c1b 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -219,3 +219,26 @@ def test_ffill_validate_fill_method(self, left, right, invalid_method): ValueError, match=re.escape("fill_method must be 'ffill' or None") ): merge_ordered(left, right, on="key", fill_method=invalid_method) + + def test_ffill_left_merge(self): + # GH 57010 + df1 = DataFrame( + { + "key": ["a", "c", "e", "a", "c", "e"], + "lvalue": [1, 2, 3, 1, 2, 3], + "group": ["a", "a", "a", "b", "b", "b"], + } + ) + df2 = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]}) + result = merge_ordered( + df1, df2, fill_method="ffill", left_by="group", how="left" + ) + expected = DataFrame( + { + "key": ["a", "c", "e", "a", "c", "e"], + "lvalue": [1, 2, 3, 1, 2, 3], + "group": ["a", "a", "a", "b", "b", "b"], + "rvalue": [np.nan, 2.0, 2.0, np.nan, 2.0, 2.0], + } + ) + tm.assert_frame_equal(result, expected) From fb2cf0fca6f231d1c24fdbed06035457e47f3970 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:12:31 +0100 Subject: [PATCH 069/396] Backport PR #57057 on branch 2.2.x (COMPAT: Make argsort compatable with numpy 2.0) (#57062) Backport PR #57057: COMPAT: Make argsort compatable with numpy 2.0 Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/compat/numpy/function.py | 2 ++ pandas/core/indexes/base.py | 2 +- pandas/core/series.py | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index a36e25a9df410..4df30f7f4a8a7 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -138,6 +138,7 @@ def validate_argmax_with_skipna(skipna: bool | ndarray | None, args, kwargs) -> ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] = None ARGSORT_DEFAULTS["kind"] = None +ARGSORT_DEFAULTS["stable"] = None validate_argsort = CompatValidator( @@ -149,6 +150,7 @@ def validate_argmax_with_skipna(skipna: bool | ndarray | None, args, kwargs) -> ARGSORT_DEFAULTS_KIND: dict[str, int | None] = {} ARGSORT_DEFAULTS_KIND["axis"] = -1 ARGSORT_DEFAULTS_KIND["order"] = None +ARGSORT_DEFAULTS_KIND["stable"] = None validate_argsort_kind = CompatValidator( ARGSORT_DEFAULTS_KIND, fname="argsort", max_fname_arg_count=0, method="both" ) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 88a08dd55f739..c36dcda6e2972 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ 
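The merge_ordered change above can be exercised directly; a minimal sketch of the call that previously raised, assuming pandas 2.2.1 with the GH 57010 fix:

import pandas as pd

left = pd.DataFrame({"key": ["a", "c", "e"], "lvalue": [1, 2, 3]})
right = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
# fill_method="ffill" with how="left" no longer raises TypeError;
# rvalue is forward-filled along the left frame's rows.
result = pd.merge_ordered(left, right, on="key", fill_method="ffill", how="left")
print(result)  # rvalue: NaN, 2.0, 2.0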
-956,7 +956,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): return self.__array_wrap__(result) @final - def __array_wrap__(self, result, context=None): + def __array_wrap__(self, result, context=None, return_scalar=False): """ Gets called after a ufunc and other functions e.g. np.split. """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 83eb545b9b681..f06a1eb533ba4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4066,6 +4066,7 @@ def argsort( axis: Axis = 0, kind: SortKind = "quicksort", order: None = None, + stable: None = None, ) -> Series: """ Return the integer indices that would sort the Series values. @@ -4082,6 +4083,8 @@ def argsort( information. 'mergesort' and 'stable' are the only stable algorithms. order : None Has no effect but is accepted for compatibility with numpy. + stable : None + Has no effect but is accepted for compatibility with numpy. Returns ------- From 2df78e8821d0e8d5ef51e3842db142faf32a64c1 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:13:44 +0100 Subject: [PATCH 070/396] Backport PR #57058 on branch 2.2.x (BUG: Series.pct_change with empty Series) (#57059) Backport PR #57058: BUG: Series.pct_change with empty Series Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/generic.py | 27 ++++++++++--------- .../tests/series/methods/test_pct_change.py | 8 ++++++ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 75445c978d262..d3065cdd8b624 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -14,6 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) .. --------------------------------------------------------------------------- .. _whatsnew_221.bug_fixes: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f8728c61e46fc..55693d4cdb753 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -12126,19 +12126,20 @@ def pct_change( if limit is lib.no_default: cols = self.items() if self.ndim == 2 else [(None, self)] for _, col in cols: - mask = col.isna().values - mask = mask[np.argmax(~mask) :] - if mask.any(): - warnings.warn( - "The default fill_method='pad' in " - f"{type(self).__name__}.pct_change is deprecated and will " - "be removed in a future version. Either fill in any " - "non-leading NA values prior to calling pct_change or " - "specify 'fill_method=None' to not fill NA values.", - FutureWarning, - stacklevel=find_stack_level(), - ) - break + if len(col) > 0: + mask = col.isna().values + mask = mask[np.argmax(~mask) :] + if mask.any(): + warnings.warn( + "The default fill_method='pad' in " + f"{type(self).__name__}.pct_change is deprecated and " + "will be removed in a future version. 
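The argsort shim above is easiest to see from the caller's side; a sketch, assuming NumPy 2.0, whose np.argsort forwards a new stable keyword to the argsort method it dispatches to:

import numpy as np
import pandas as pd

ser = pd.Series([3, 1, 2])
# Under NumPy 2.0 this dispatches to Series.argsort with stable=None,
# which the shim now accepts (and ignores).
print(np.argsort(ser))
print(ser.argsort(stable=None))  # explicit call is accepted too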
Either fill in " + "any non-leading NA values prior to calling pct_change " + "or specify 'fill_method=None' to not fill NA values.", + FutureWarning, + stacklevel=find_stack_level(), + ) + break fill_method = "pad" if limit is lib.no_default: limit = None diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py index 9727ef3d5c27c..6c80e711c3684 100644 --- a/pandas/tests/series/methods/test_pct_change.py +++ b/pandas/tests/series/methods/test_pct_change.py @@ -118,3 +118,11 @@ def test_pct_change_no_warning_na_beginning(): result = ser.pct_change() expected = Series([np.nan, np.nan, np.nan, 1, 0.5]) tm.assert_series_equal(result, expected) + + +def test_pct_change_empty(): + # GH 57056 + ser = Series([], dtype="float64") + expected = ser.copy() + result = ser.pct_change(periods=0) + tm.assert_series_equal(expected, result) From 441f65df5c6dea850e6334ab770528184dfb0d33 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 25 Jan 2024 19:03:14 +0100 Subject: [PATCH 071/396] Backport PR #57034 on branch 2.2.x (REGR: perf regression in Series.combine_first) (#57072) Backport PR #57034: REGR: perf regression in Series.combine_first Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/series.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index d3065cdd8b624..93965ffed23d3 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -13,6 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) diff --git a/pandas/core/series.py b/pandas/core/series.py index f06a1eb533ba4..c6a905cbb6ec1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -86,6 +86,7 @@ from pandas.core.dtypes.dtypes import ( CategoricalDtype, ExtensionDtype, + SparseDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -3510,6 +3511,13 @@ def combine_first(self, other) -> Series: """ from pandas.core.reshape.concat import concat + if self.dtype == other.dtype: + if self.index.equals(other.index): + return self.mask(self.isna(), other) + elif self._can_hold_na and not isinstance(self.dtype, SparseDtype): + this, other = self.align(other, join="outer") + return this.mask(this.isna(), other) + new_index = self.index.union(other.index) this = self From c45129431ef4b089b14f0f3a892cc115e155315b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 26 Jan 2024 19:41:39 +0100 Subject: [PATCH 072/396] Backport PR #57046 on branch 2.2.x (REGR: groupby.idxmin/idxmax wrong result on extreme values) (#57086) Backport PR #57046: REGR: groupby.idxmin/idxmax wrong result on extreme values Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 2 + pandas/_libs/groupby.pyx | 17 ++++--- pandas/core/groupby/ops.py | 1 + pandas/tests/groupby/test_reductions.py | 62 +++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst 
b/doc/source/whatsnew/v2.2.1.rst index 93965ffed23d3..23da4a7f6ab25 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -15,6 +15,8 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) +- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 19d71b0a6fde3..ac24c0bb8df88 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1767,6 +1767,7 @@ def group_idxmin_idxmax( Py_ssize_t i, j, N, K, lab numeric_object_t val numeric_object_t[:, ::1] group_min_or_max + uint8_t[:, ::1] seen bint uses_mask = mask is not None bint isna_entry bint compute_max = name == "idxmax" @@ -1780,13 +1781,10 @@ def group_idxmin_idxmax( if numeric_object_t is object: group_min_or_max = np.empty((out).shape, dtype=object) + seen = np.zeros((out).shape, dtype=np.uint8) else: group_min_or_max = np.empty_like(out, dtype=values.dtype) - if N > 0 and K > 0: - # When N or K is zero, we never use group_min_or_max - group_min_or_max[:] = _get_min_or_max( - values[0, 0], compute_max, is_datetimelike - ) + seen = np.zeros_like(out, dtype=np.uint8) # When using transform, we need a valid value for take in the case # a category is not observed; these values will be dropped @@ -1802,6 +1800,7 @@ def group_idxmin_idxmax( if not skipna and out[lab, j] == -1: # Once we've hit NA there is no going back continue + val = values[i, j] if uses_mask: @@ -1810,10 +1809,14 @@ def group_idxmin_idxmax( isna_entry = _treat_as_na(val, is_datetimelike) if isna_entry: - if not skipna: + if not skipna or not seen[lab, j]: out[lab, j] = -1 else: - if compute_max: + if not seen[lab, j]: + seen[lab, j] = True + group_min_or_max[lab, j] = val + out[lab, j] = i + elif compute_max: if val > group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5e83eaee02afc..e2ddf9aa5c0c1 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -424,6 +424,7 @@ def _call_cython_op( mask=mask, result_mask=result_mask, is_datetimelike=is_datetimelike, + **kwargs, ) elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]: if self.how in ["std", "sem"]: diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 425079f943aba..422322d03c4c0 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -195,6 +195,68 @@ def test_empty(frame_or_series, bool_agg_func): tm.assert_equal(result, expected) +@pytest.mark.parametrize("how", ["idxmin", "idxmax"]) +def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype): + # GH#57040 + if any_real_numpy_dtype is int or 
any_real_numpy_dtype is float: + # No need to test + return + info = np.iinfo if "int" in any_real_numpy_dtype else np.finfo + min_value = info(any_real_numpy_dtype).min + max_value = info(any_real_numpy_dtype).max + df = DataFrame( + {"a": [2, 1, 1, 2], "b": [min_value, max_value, max_value, min_value]}, + dtype=any_real_numpy_dtype, + ) + gb = df.groupby("a") + result = getattr(gb, how)() + expected = DataFrame( + {"b": [1, 0]}, index=pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype) + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["idxmin", "idxmax"]) +def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype): + # GH#57040 + min_value = np.finfo(float_numpy_dtype).min + max_value = np.finfo(float_numpy_dtype).max + df = DataFrame( + { + "a": Series(np.repeat(range(1, 6), repeats=2), dtype="intp"), + "b": Series( + [ + np.nan, + min_value, + np.nan, + max_value, + min_value, + np.nan, + max_value, + np.nan, + np.nan, + np.nan, + ], + dtype=float_numpy_dtype, + ), + }, + ) + gb = df.groupby("a") + + warn = None if skipna else FutureWarning + msg = f"The behavior of DataFrameGroupBy.{how} with all-NA values" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb, how)(skipna=skipna) + if skipna: + values = [1, 3, 4, 6, np.nan] + else: + values = np.nan + expected = DataFrame( + {"b": values}, index=pd.Index(range(1, 6), name="a", dtype="intp") + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "func, values", [ From b44512726e9d55421f67683ff6d26ec7ce82046b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 26 Jan 2024 20:50:08 +0100 Subject: [PATCH 073/396] Backport PR #57084 on branch 2.2.x (Fix mem leak in read_csv) (#57090) Backport PR #57084: Fix mem leak in read_csv Co-authored-by: William Ayd --- asv_bench/benchmarks/io/csv.py | 3 +++ doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/src/parser/tokenizer.c | 9 +++++++++ 3 files changed, 13 insertions(+) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 9ac83db4f85b9..dae6107db4d92 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -408,6 +408,9 @@ def time_read_stringcsv(self, engine): def time_read_bytescsv(self, engine): read_csv(self.data(self.BytesIO_input), engine=engine) + def peakmem_read_csv(self, engine): + read_csv(self.data(self.BytesIO_input), engine=engine) + class ReadCSVCategorical(BaseIO): fname = "__test__.csv" diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 23da4a7f6ab25..b9b2821ebc468 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -13,6 +13,7 @@ including other versions of pandas. 
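A minimal way to observe the leak this patch addresses (an editorial sketch, not part of
the patch; psutil is an assumption here, used only to read process RSS, and the CSV is
held in memory):

    import io
    import os

    import psutil  # assumption: not a pandas dependency
    import pandas as pd

    data = "a,b\n" + "1,2\n" * 100_000
    proc = psutil.Process(os.getpid())
    before = proc.memory_info().rss
    for _ in range(50):
        pd.read_csv(io.StringIO(data), engine="c")
    # growth in MB; should stay roughly flat once the parser frees its buffers
    print((proc.memory_info().rss - before) / 1e6)
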
Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed memory leak in :func:`read_csv` (:issue:`57039`) - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 0e4188bea4dc7..c9f7a796a9b1c 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -109,6 +109,14 @@ void parser_set_default_options(parser_t *self) { parser_t *parser_new(void) { return (parser_t *)calloc(1, sizeof(parser_t)); } +static void parser_clear_data_buffers(parser_t *self) { + free_if_not_null((void *)&self->stream); + free_if_not_null((void *)&self->words); + free_if_not_null((void *)&self->word_starts); + free_if_not_null((void *)&self->line_start); + free_if_not_null((void *)&self->line_fields); +} + static void parser_cleanup(parser_t *self) { // XXX where to put this free_if_not_null((void *)&self->error_msg); @@ -119,6 +127,7 @@ static void parser_cleanup(parser_t *self) { self->skipset = NULL; } + parser_clear_data_buffers(self); if (self->cb_cleanup != NULL) { self->cb_cleanup(self->source); self->cb_cleanup = NULL; From 1550858f6e34f85bb9d489891d4c5ac3146b01b6 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 27 Jan 2024 20:38:01 +0100 Subject: [PATCH 074/396] Backport PR #57078 on branch 2.2.x (54628 fix find stack level memory leak) (#57105) Backport PR #57078: 54628 fix find stack level memory leak Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/_testing/_warnings.py | 7 ++++++- pandas/util/_exceptions.py | 24 +++++++++++++++--------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index f11dc11f6ac0d..c9a287942f2da 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -218,7 +218,12 @@ def _assert_raised_with_correct_stacklevel( frame = inspect.currentframe() for _ in range(4): frame = frame.f_back # type: ignore[union-attr] - caller_filename = inspect.getfile(frame) # type: ignore[arg-type] + try: + caller_filename = inspect.getfile(frame) # type: ignore[arg-type] + finally: + # See note in + # https://docs.python.org/3/library/inspect.html#inspect.Traceback + del frame msg = ( "Warning not set with correct stacklevel. 
" f"File where warning is raised: {actual_warning.filename} != " diff --git a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py index 573f76a63459b..5f50838d37315 100644 --- a/pandas/util/_exceptions.py +++ b/pandas/util/_exceptions.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: from collections.abc import Generator + from types import FrameType @contextlib.contextmanager @@ -42,15 +43,20 @@ def find_stack_level() -> int: test_dir = os.path.join(pkg_dir, "tests") # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow - frame = inspect.currentframe() - n = 0 - while frame: - fname = inspect.getfile(frame) - if fname.startswith(pkg_dir) and not fname.startswith(test_dir): - frame = frame.f_back - n += 1 - else: - break + frame: FrameType | None = inspect.currentframe() + try: + n = 0 + while frame: + filename = inspect.getfile(frame) + if filename.startswith(pkg_dir) and not filename.startswith(test_dir): + frame = frame.f_back + n += 1 + else: + break + finally: + # See note in + # https://docs.python.org/3/library/inspect.html#inspect.Traceback + del frame return n From f577be25dfe97e6e4fa4192e007df2882fb3f20f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 29 Jan 2024 04:44:40 +0100 Subject: [PATCH 075/396] Backport PR #57089 on branch 2.2.x (BUG: wide_to_long with string columns) (#57120) Backport PR #57089: BUG: wide_to_long with string columns Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/reshape/melt.py | 3 +-- pandas/core/strings/accessor.py | 4 ++-- pandas/tests/reshape/test_melt.py | 30 ++++++++++++++++++++++++++++++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index b9b2821ebc468..660594c98e0f2 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -16,6 +16,7 @@ Fixed regressions - Fixed memory leak in :func:`read_csv` (:issue:`57039`) - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index bb1cd0d738dac..e54f847895f1a 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -458,8 +458,7 @@ def wide_to_long( def get_var_names(df, stub: str, sep: str, suffix: str): regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$" - pattern = re.compile(regex) - return df.columns[df.columns.str.match(pattern)] + return df.columns[df.columns.str.match(regex)] def melt_stub(df, stub: str, i, j, value_vars, sep: str): newdf = melt( diff --git 
a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 1b7d632c0fa80..da10a12d02ae4 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1336,14 +1336,14 @@ def contains( return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def match(self, pat, case: bool = True, flags: int = 0, na=None): + def match(self, pat: str, case: bool = True, flags: int = 0, na=None): """ Determine if each string starts with a match of a regular expression. Parameters ---------- pat : str - Character sequence or regular expression. + Character sequence. case : bool, default True If True, case sensitive. flags : int, default 0 (no flags) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index ff9f927597956..272c5b3403293 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -1220,3 +1220,33 @@ def test_missing_stubname(self, dtype): new_level = expected.index.levels[0].astype(dtype) expected.index = expected.index.set_levels(new_level, level=0) tm.assert_frame_equal(result, expected) + + +def test_wide_to_long_pyarrow_string_columns(): + # GH 57066 + pytest.importorskip("pyarrow") + df = DataFrame( + { + "ID": {0: 1}, + "R_test1": {0: 1}, + "R_test2": {0: 1}, + "R_test3": {0: 2}, + "D": {0: 1}, + } + ) + df.columns = df.columns.astype("string[pyarrow_numpy]") + result = wide_to_long( + df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*" + ) + expected = DataFrame( + [[1, 1], [1, 1], [1, 2]], + columns=Index(["D", "R"], dtype=object), + index=pd.MultiIndex.from_arrays( + [ + [1, 1, 1], + Index(["test1", "test2", "test3"], dtype="string[pyarrow_numpy]"), + ], + names=["ID", "UNPIVOTED"], + ), + ) + tm.assert_frame_equal(result, expected) From df0762d18eecb567b6813cc5fd5fabc6af214aa5 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 29 Jan 2024 21:53:39 +0100 Subject: [PATCH 076/396] Backport PR #57126 on branch 2.2.x (Bump pypa/cibuildwheel from 2.16.2 to 2.16.4) (#57132) Backport PR #57126: Bump pypa/cibuildwheel from 2.16.2 to 2.16.4 Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 841559c8e9799..6d3b9048a2122 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -139,7 +139,7 @@ jobs: - name: Build normal wheels if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }} - uses: pypa/cibuildwheel@v2.16.2 + uses: pypa/cibuildwheel@v2.16.4 with: package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: @@ -148,7 +148,7 @@ jobs: - name: Build nightly wheels (with NumPy pre-release) if: ${{ (env.IS_SCHEDULE_DISPATCH == 'true' && env.IS_PUSH != 'true') }} - uses: pypa/cibuildwheel@v2.16.2 + uses: pypa/cibuildwheel@v2.16.4 with: package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From c1723cd09e83248dd3fbc81bb43ba537082a4fe0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 29 Jan 2024 21:53:49 +0100 Subject: [PATCH 077/396] Backport PR #57101 on branch 2.2.x (REGR: Index.join raising TypeError when joining an empty index to a 
mixed type index) (#57133) Backport PR #57101: REGR: Index.join raising TypeError when joining an empty index to a mixed type index Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/indexes/base.py | 65 +++++++++++++------------ pandas/tests/reshape/merge/test_join.py | 19 ++++++++ 3 files changed, 53 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 660594c98e0f2..ff5fadec735e6 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) +- Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c36dcda6e2972..5e7f2e27f1275 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4615,38 +4615,12 @@ def join( if level is not None and (self._is_multi or other._is_multi): return self._join_level(other, level, how=how) - lidx: np.ndarray | None - ridx: np.ndarray | None - - if len(other) == 0: - if how in ("left", "outer"): - if sort and not self.is_monotonic_increasing: - lidx = self.argsort() - join_index = self.take(lidx) - else: - lidx = None - join_index = self._view() - ridx = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, lidx, ridx - elif how in ("right", "inner", "cross"): - join_index = other._view() - lidx = np.array([], dtype=np.intp) - return join_index, lidx, None - - if len(self) == 0: - if how in ("right", "outer"): - if sort and not other.is_monotonic_increasing: - ridx = other.argsort() - join_index = other.take(ridx) - else: - ridx = None - join_index = other._view() - lidx = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, lidx, ridx - elif how in ("left", "inner", "cross"): - join_index = self._view() - ridx = np.array([], dtype=np.intp) - return join_index, None, ridx + if len(self) == 0 or len(other) == 0: + try: + return self._join_empty(other, how, sort) + except TypeError: + # object dtype; non-comparable objects + pass if self.dtype != other.dtype: dtype = self._find_common_type_compat(other) @@ -4681,6 +4655,33 @@ def join( return self._join_via_get_indexer(other, how, sort) + @final + def _join_empty( + self, other: Index, how: JoinHow, sort: bool + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + assert len(self) == 0 or len(other) == 0 + _validate_join_method(how) + + lidx: np.ndarray | None + ridx: np.ndarray | None + + if len(other): + how = cast(JoinHow, {"left": "right", "right": "left"}.get(how, how)) + join_index, ridx, lidx = other._join_empty(self, how, sort) + elif how in ["left", 
"outer"]: + if sort and not self.is_monotonic_increasing: + lidx = self.argsort() + join_index = self.take(lidx) + else: + lidx = None + join_index = self._view() + ridx = np.broadcast_to(np.intp(-1), len(join_index)) + else: + join_index = other._view() + lidx = np.array([], dtype=np.intp) + ridx = None + return join_index, lidx, ridx + @final def _join_via_get_indexer( self, other: Index, how: JoinHow, sort: bool diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 9a2f18f33bce5..db5a0437a14f0 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -1042,6 +1042,25 @@ def test_join_empty(left_empty, how, exp): tm.assert_frame_equal(result, expected) +def test_join_empty_uncomparable_columns(): + # GH 57048 + df1 = DataFrame() + df2 = DataFrame(columns=["test"]) + df3 = DataFrame(columns=["foo", ("bar", "baz")]) + + result = df1 + df2 + expected = DataFrame(columns=["test"]) + tm.assert_frame_equal(result, expected) + + result = df2 + df3 + expected = DataFrame(columns=[("bar", "baz"), "foo", "test"]) + tm.assert_frame_equal(result, expected) + + result = df1 + df3 + expected = DataFrame(columns=[("bar", "baz"), "foo"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "how, values", [ From 10b58730b6a302a6c372d6d4e1b04cb87007de97 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 29 Jan 2024 23:48:32 +0100 Subject: [PATCH 078/396] Backport PR #57122 on branch 2.2.x (CI: autouse add_doctest_imports) (#57135) Backport PR #57122: CI: autouse add_doctest_imports Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/conftest.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 983272d79081e..a11d9c2b4b2a1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -190,10 +190,6 @@ def pytest_collection_modifyitems(items, config) -> None: if is_doctest: for item in items: - # autouse=True for the add_doctest_imports can lead to expensive teardowns - # since doctest_namespace is a session fixture - item.add_marker(pytest.mark.usefixtures("add_doctest_imports")) - for path, message in ignored_doctest_warnings: ignore_doctest_warning(item, path, message) @@ -250,7 +246,14 @@ def pytest_collection_modifyitems(items, config) -> None: ) -@pytest.fixture +# ---------------------------------------------------------------- +# Autouse fixtures +# ---------------------------------------------------------------- + + +# https://github.com/pytest-dev/pytest/issues/11873 +# Would like to avoid autouse=True, but cannot as of pytest 8.0.0 +@pytest.fixture(autouse=True) def add_doctest_imports(doctest_namespace) -> None: """ Make `np` and `pd` names available for doctests. 
@@ -259,9 +262,6 @@ def add_doctest_imports(doctest_namespace) -> None: doctest_namespace["pd"] = pd -# ---------------------------------------------------------------- -# Autouse fixtures -# ---------------------------------------------------------------- @pytest.fixture(autouse=True) def configure_tests() -> None: """ From acd914dedba087b3642e110aadac665006da23c0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Jan 2024 05:36:42 +0100 Subject: [PATCH 079/396] Backport PR #57102 on branch 2.2.x (ENH: Add skipna to groupby.first and groupby.last) (#57141) Backport PR #57102: ENH: Add skipna to groupby.first and groupby.last Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 3 +- pandas/_libs/groupby.pyi | 2 ++ pandas/_libs/groupby.pyx | 41 ++++++++++++++-------- pandas/_testing/__init__.py | 7 ++++ pandas/conftest.py | 32 +++++++++++++++++ pandas/core/groupby/groupby.py | 36 ++++++++++++++----- pandas/core/resample.py | 10 ++++-- pandas/tests/groupby/test_reductions.py | 31 ++++++++++++++++ pandas/tests/resample/test_base.py | 29 +++++++++++++++ pandas/tests/resample/test_resample_api.py | 4 +-- 10 files changed, 167 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index ff5fadec735e6..589903ddcca71 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -34,7 +34,8 @@ Bug fixes Other ~~~~~ -- +- Added the argument ``skipna`` to :meth:`DataFrameGroupBy.first`, :meth:`DataFrameGroupBy.last`, :meth:`SeriesGroupBy.first`, and :meth:`SeriesGroupBy.last`; achieving ``skipna=False`` used to be available via :meth:`DataFrameGroupBy.nth`, but the behavior was changed in pandas 2.0.0 (:issue:`57019`) +- Added the argument ``skipna`` to :meth:`Resampler.first`, :meth:`Resampler.last` (:issue:`57019`) .. --------------------------------------------------------------------------- .. _whatsnew_221.contributors: diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 135828a23648a..a494b61fa7e3d 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -136,6 +136,7 @@ def group_last( result_mask: npt.NDArray[np.bool_] | None = ..., min_count: int = ..., # Py_ssize_t is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... def group_nth( out: np.ndarray, # rank_t[:, ::1] @@ -147,6 +148,7 @@ def group_nth( min_count: int = ..., # int64_t rank: int = ..., # int64_t is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... 
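For illustration only (not part of the stub changes), the user-facing behavior the new
``skipna`` flag enables, assuming pandas 2.2.1:

    import pandas as pd

    ser = pd.Series([None, 1.0, None, 2.0], dtype="Float64")
    gb = ser.groupby([0, 0, 1, 1])
    print(gb.first())               # NA skipped: 1.0 and 2.0
    print(gb.first(skipna=False))   # positional first kept: <NA> and <NA>
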
def group_rank( out: np.ndarray, # float64_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index ac24c0bb8df88..b855d64d0be18 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1424,6 +1424,7 @@ def group_last( uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=-1, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -1458,14 +1459,19 @@ def group_last( for j in range(K): val = values[i, j] - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) + if skipna: + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + if isna_entry: + continue - if not isna_entry: - nobs[lab, j] += 1 - resx[lab, j] = val + nobs[lab, j] += 1 + resx[lab, j] = val + + if uses_mask and not skipna: + result_mask[lab, j] = mask[i, j] _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx @@ -1486,6 +1492,7 @@ def group_nth( int64_t min_count=-1, int64_t rank=1, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -1520,15 +1527,19 @@ def group_nth( for j in range(K): val = values[i, j] - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) + if skipna: + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + if isna_entry: + continue - if not isna_entry: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + if uses_mask and not skipna: + result_mask[lab, j] = mask[i, j] _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 672c16a85086c..361998db8e38b 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -236,11 +236,18 @@ + TIMEDELTA_PYARROW_DTYPES + BOOL_PYARROW_DTYPES ) + ALL_REAL_PYARROW_DTYPES_STR_REPR = ( + ALL_INT_PYARROW_DTYPES_STR_REPR + FLOAT_PYARROW_DTYPES_STR_REPR + ) else: FLOAT_PYARROW_DTYPES_STR_REPR = [] ALL_INT_PYARROW_DTYPES_STR_REPR = [] ALL_PYARROW_DTYPES = [] + ALL_REAL_PYARROW_DTYPES_STR_REPR = [] +ALL_REAL_NULLABLE_DTYPES = ( + FLOAT_NUMPY_DTYPES + ALL_REAL_EXTENSION_DTYPES + ALL_REAL_PYARROW_DTYPES_STR_REPR +) arithmetic_dunder_methods = [ "__add__", diff --git a/pandas/conftest.py b/pandas/conftest.py index a11d9c2b4b2a1..7c35dfdde90ba 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1642,6 +1642,38 @@ def any_numpy_dtype(request): return request.param +@pytest.fixture(params=tm.ALL_REAL_NULLABLE_DTYPES) +def any_real_nullable_dtype(request): + """ + Parameterized fixture for all real dtypes that can hold NA. 
+ + * float + * 'float32' + * 'float64' + * 'Float32' + * 'Float64' + * 'UInt8' + * 'UInt16' + * 'UInt32' + * 'UInt64' + * 'Int8' + * 'Int16' + * 'Int32' + * 'Int64' + * 'uint8[pyarrow]' + * 'uint16[pyarrow]' + * 'uint32[pyarrow]' + * 'uint64[pyarrow]' + * 'int8[pyarrow]' + * 'int16[pyarrow]' + * 'int32[pyarrow]' + * 'int64[pyarrow]' + * 'float[pyarrow]' + * 'double[pyarrow]' + """ + return request.param + + @pytest.fixture(params=tm.ALL_NUMERIC_DTYPES) def any_numeric_dtype(request): """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5b18455dbe8a8..db8949788567b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3335,9 +3335,13 @@ def max( ) @final - def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: + def first( + self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True + ) -> NDFrameT: """ - Compute the first non-null entry of each column. + Compute the first entry of each column within each group. + + Defaults to skipping NA elements. Parameters ---------- @@ -3345,12 +3349,17 @@ def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: Include only float, int, boolean columns. min_count : int, default -1 The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` valid values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 2.2.1 Returns ------- Series or DataFrame - First non-null of values within each group. + First values within each group. See Also -------- @@ -3402,12 +3411,17 @@ def first(x: Series): min_count=min_count, alias="first", npfunc=first_compat, + skipna=skipna, ) @final - def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: + def last( + self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True + ) -> NDFrameT: """ - Compute the last non-null entry of each column. + Compute the last entry of each column within each group. + + Defaults to skipping NA elements. Parameters ---------- @@ -3416,12 +3430,17 @@ def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: everything, then use only numeric data. min_count : int, default -1 The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` valid values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 2.2.1 Returns ------- Series or DataFrame - Last non-null of values within each group. + Last of values within each group. 
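A worked illustration of the docstring above (an editorial sketch, not part of the
patch):

    import pandas as pd

    ser = pd.Series([3.0, None], dtype="Float64")
    print(ser.groupby([0, 0]).last())              # 3.0, trailing NA skipped
    print(ser.groupby([0, 0]).last(skipna=False))  # <NA>, positional last kept
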
See Also -------- @@ -3461,6 +3480,7 @@ def last(x: Series): min_count=min_count, alias="last", npfunc=last_compat, + skipna=skipna, ) @final diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 3e9507bd4347f..2d430ef4dcff6 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1306,12 +1306,15 @@ def first( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, *args, **kwargs, ): maybe_warn_args_and_kwargs(type(self), "first", args, kwargs) nv.validate_resampler_func("first", args, kwargs) - return self._downsample("first", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "first", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) @final @doc(GroupBy.last) @@ -1319,12 +1322,15 @@ def last( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, *args, **kwargs, ): maybe_warn_args_and_kwargs(type(self), "last", args, kwargs) nv.validate_resampler_func("last", args, kwargs) - return self._downsample("last", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "last", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) @final @doc(GroupBy.median) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 422322d03c4c0..25b0f80639cff 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -7,6 +7,9 @@ from pandas._libs.tslibs import iNaT +from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.missing import na_value_for_dtype + import pandas as pd from pandas import ( DataFrame, @@ -327,6 +330,34 @@ def test_groupby_non_arithmetic_agg_int_like_precision(method, data): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("how", ["first", "last"]) +def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how): + # GH#57019 + na_value = na_value_for_dtype(pandas_dtype(any_real_nullable_dtype)) + df = DataFrame( + { + "a": [2, 1, 1, 2, 3, 3], + "b": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], + "c": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], + }, + dtype=any_real_nullable_dtype, + ) + gb = df.groupby("a", sort=sort) + method = getattr(gb, how) + result = method(skipna=skipna) + + ilocs = { + ("first", True): [3, 1, 4], + ("first", False): [0, 1, 4], + ("last", True): [3, 1, 5], + ("last", False): [3, 2, 5], + }[how, skipna] + expected = df.iloc[ilocs].set_index("a") + if sort: + expected = expected.sort_index() + tm.assert_frame_equal(result, expected) + + def test_idxmin_idxmax_axis1(): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 50644e33e45e1..dcf6c6099abab 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -3,6 +3,9 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_extension_array_dtype + +import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -429,3 +432,29 @@ def test_resample_quantile(series): result = ser.resample(freq).quantile(q) expected = ser.resample(freq).agg(lambda x: x.quantile(q)).rename(ser.name) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("how", ["first", "last"]) +def test_first_last_skipna(any_real_nullable_dtype, skipna, how): + # GH#57019 + if is_extension_array_dtype(any_real_nullable_dtype): + na_value = 
Series(dtype=any_real_nullable_dtype).dtype.na_value + else: + na_value = np.nan + df = DataFrame( + { + "a": [2, 1, 1, 2], + "b": [na_value, 3.0, na_value, 4.0], + "c": [na_value, 3.0, na_value, 4.0], + }, + index=date_range("2020-01-01", periods=4, freq="D"), + dtype=any_real_nullable_dtype, + ) + rs = df.resample("ME") + method = getattr(rs, how) + result = method(skipna=skipna) + + gb = df.groupby(df.shape[0] * [pd.to_datetime("2020-01-31")]) + expected = getattr(gb, how)(skipna=skipna) + expected.index.freq = "ME" + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index d3e906827b754..12abd1c98784b 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -1040,11 +1040,11 @@ def test_args_kwargs_depr(method, raises): if raises: with tm.assert_produces_warning(FutureWarning, match=warn_msg): with pytest.raises(UnsupportedFunctionCall, match=error_msg): - func(*args, 1, 2, 3) + func(*args, 1, 2, 3, 4) else: with tm.assert_produces_warning(FutureWarning, match=warn_msg): with pytest.raises(TypeError, match=error_msg_type): - func(*args, 1, 2, 3) + func(*args, 1, 2, 3, 4) def test_df_axis_param_depr(): From c0a269b354ad1b50b9a7fd56789c5137e829fcc2 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Jan 2024 05:37:02 +0100 Subject: [PATCH 080/396] Backport PR #57061 on branch 2.2.x (REGR: non-unique, masked dtype index raising IndexError) (#57142) Backport PR #57061: REGR: non-unique, masked dtype index raising IndexError Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/index.pyx | 63 +++++++++++++++---------------- pandas/tests/indexing/test_loc.py | 12 ++++++ 3 files changed, 44 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 589903ddcca71..6d6f0f1ee758f 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) +- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 0dc139781f58d..e4dfe9dec3623 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -96,6 +96,20 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None): return indexer.view(bool) +cdef _maybe_resize_array(ndarray values, Py_ssize_t 
loc, Py_ssize_t max_length): + """ + Resize array if loc is out of bounds. + """ + cdef: + Py_ssize_t n = len(values) + + if loc >= n: + while loc >= n: + n *= 2 + values = np.resize(values, min(n, max_length)) + return values + + # Don't populate hash tables in monotonic indexes larger than this _SIZE_CUTOFF = 1_000_000 @@ -450,27 +464,18 @@ cdef class IndexEngine: # found if val in d: key = val - + result = _maybe_resize_array( + result, + count + len(d[key]) - 1, + max_alloc + ) for j in d[key]: - - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = np.resize(result, n_alloc) - result[count] = j count += 1 # value not found else: - - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = np.resize(result, n_alloc) + result = _maybe_resize_array(result, count, max_alloc) result[count] = -1 count += 1 missing[count_missing] = i @@ -1193,13 +1198,12 @@ cdef class MaskedIndexEngine(IndexEngine): if PySequence_GetItem(target_mask, i): if na_pos: + result = _maybe_resize_array( + result, + count + len(na_pos) - 1, + max_alloc, + ) for na_idx in na_pos: - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result[count] = na_idx count += 1 continue @@ -1207,23 +1211,18 @@ cdef class MaskedIndexEngine(IndexEngine): elif val in d: # found key = val - + result = _maybe_resize_array( + result, + count + len(d[key]) - 1, + max_alloc, + ) for j in d[key]: - - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result[count] = j count += 1 continue # value not found - if count >= n_alloc: - n_alloc += 10_000 - result = np.resize(result, n_alloc) + result = _maybe_resize_array(result, count, max_alloc) result[count] = -1 count += 1 missing[count_missing] = i diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 61c44c8a2a8f4..952251a58e981 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -3364,3 +3364,15 @@ def test_getitem_loc_str_periodindex(self): index = pd.period_range(start="2000", periods=20, freq="B") series = Series(range(20), index=index) assert series.loc["2000-01-14"] == 9 + + def test_loc_nonunique_masked_index(self): + # GH 57027 + ids = list(range(11)) + index = Index(ids * 1000, dtype="Int64") + df = DataFrame({"val": np.arange(len(index), dtype=np.intp)}, index=index) + result = df.loc[ids] + expected = DataFrame( + {"val": index.argsort(kind="stable").astype(np.intp)}, + index=Index(np.array(ids).repeat(1000), dtype="Int64"), + ) + tm.assert_frame_equal(result, expected) From 4bad5fcf172ac443c384da8482a3cc617da908bb Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Jan 2024 05:37:17 +0100 Subject: [PATCH 081/396] Backport PR #57139 on branch 2.2.x (BUG: Index(Series) makes array read only for object dtype) (#57143) Backport PR #57139: BUG: Index(Series) makes array read only for object dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 2 +- pandas/_libs/ops.pyx | 2 +- pandas/core/common.py | 2 ++ pandas/tests/indexes/base_class/test_constructors.py | 7 +++++++ pandas/tests/indexes/test_common.py | 9 +++++++++ 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 
6d6f0f1ee758f..1302648c3fc9a 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -28,7 +28,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) .. --------------------------------------------------------------------------- .. _whatsnew_221.other: diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 9154e836b3477..567bfc02a2950 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -29,7 +29,7 @@ from pandas._libs.util cimport is_nan @cython.wraparound(False) @cython.boundscheck(False) -def scalar_compare(object[:] values, object val, object op) -> ndarray: +def scalar_compare(ndarray[object] values, object val, object op) -> ndarray: """ Compare each element of `values` array with the scalar `val`, with the comparison operation described by `op`. diff --git a/pandas/core/common.py b/pandas/core/common.py index 7d864e02be54e..9f024498d66ed 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -233,6 +233,8 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi values = list(values) elif isinstance(values, ABCIndex): return values._values + elif isinstance(values, ABCSeries): + return values._values if isinstance(values, list) and dtype in [np.object_, object]: return construct_1d_object_array_from_listlike(values) diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index fd5176a28565e..338509dd239e6 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -71,3 +71,10 @@ def test_inference_on_pandas_objects(self): with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): result = Index(ser) assert result.dtype != np.object_ + + def test_constructor_not_read_only(self): + # GH#57130 + ser = Series([1, 2], dtype=object) + with pd.option_context("mode.copy_on_write", True): + idx = Index(ser) + assert idx._values.flags.writeable diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 412a59d15307d..80c39322b9b81 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -500,3 +500,12 @@ def test_ndarray_compat_properties(index): # test for validity idx.nbytes idx.values.nbytes + + +def test_compare_read_only_array(): + # GH#57130 + arr = np.array([], dtype=object) + arr.flags.writeable = False + idx = pd.Index(arr) + result = idx > 69 + assert result.dtype == bool From 27cea3a8d338e2ebaa809e9079de19fcb3991cee Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Jan 2024 18:46:41 +0100 Subject: [PATCH 082/396] Backport PR #57144 on branch 2.2.x (CI: Fix _get_dst_hours for numpy 2.0 change) (#57153) Backport PR #57144: CI: Fix _get_dst_hours for numpy 2.0 change Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/_libs/tslibs/tzconversion.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 2c4f0cd14db13..e3facd3d9599b 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -607,7 +607,8 @@ cdef ndarray[int64_t] _get_dst_hours( ndarray[uint8_t, cast=True] mismatch ndarray[int64_t] delta, dst_hours ndarray[intp_t] switch_idxs, 
trans_idx, grp, a_idx, b_idx, one_diff - list trans_grp + # TODO: Can uncomment when numpy >=2 is the minimum + # tuple trans_grp intp_t switch_idx int64_t left, right From f6fd475680761f12dbc4abe3db4a26e55b68fd90 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 31 Jan 2024 02:52:25 +0100 Subject: [PATCH 083/396] Backport PR #57157 on branch 2.2.x (BUG: Fix to_dict with datelike types and orient=list) (#57160) Backport PR #57157: BUG: Fix to_dict with datelike types and orient=list Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/methods/to_dict.py | 8 ++------ pandas/tests/frame/methods/test_to_dict.py | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 1302648c3fc9a..19b7e3493f964 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index 7bd4851425c3b..accbd92a91ed6 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -171,13 +171,9 @@ def to_dict( return into_c( ( k, - list( - map( - maybe_box_native, v.to_numpy(na_value=box_na_values[i]).tolist() - ) - ) + list(map(maybe_box_native, v.to_numpy(na_value=box_na_values[i]))) if i in object_dtype_indices_as_set - else v.to_numpy().tolist(), + else list(map(maybe_box_native, v.to_numpy())), ) for i, (k, v) in enumerate(df.items()) ) diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index 61f0ad30b4519..570f85a4a31ee 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -12,8 +12,11 @@ NA, DataFrame, Index, + Interval, MultiIndex, + Period, Series, + Timedelta, Timestamp, ) import pandas._testing as tm @@ -519,3 +522,14 @@ def test_to_dict_pos_args_deprecation(self): ) with tm.assert_produces_warning(FutureWarning, match=msg): df.to_dict("records", {}) + + +@pytest.mark.parametrize( + "val", [Timestamp(2020, 1, 1), Timedelta(1), Period("2020"), Interval(1, 2)] +) +def test_to_dict_list_pd_scalars(val): + # GH 54824 + df = DataFrame({"a": [val]}) + result = df.to_dict(orient="list") + 
expected = {"a": [val]} + assert result == expected From 59e6c80a47dd5bf0799e622eae7b3a0c5864309f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 31 Jan 2024 20:06:35 +0100 Subject: [PATCH 084/396] Backport PR #57175 on branch 2.2.x (BUG: Interchange protocol implementation handles empty dataframes incorrectly) (#57179) Backport PR #57175: BUG: Interchange protocol implementation handles empty dataframes incorrectly Co-authored-by: Marco Edward Gorelli --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/interchange/buffer.py | 2 +- pandas/tests/interchange/test_impl.py | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 19b7e3493f964..4a20eda08937a 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -29,6 +29,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`) - Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/interchange/buffer.py b/pandas/core/interchange/buffer.py index a54e4428bd836..5c97fc17d7070 100644 --- a/pandas/core/interchange/buffer.py +++ b/pandas/core/interchange/buffer.py @@ -23,7 +23,7 @@ def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: """ Handle only regular columns (= numpy arrays) for now. """ - if not x.strides == (x.dtype.itemsize,): + if x.strides[0] and not x.strides == (x.dtype.itemsize,): # The protocol does not support strided buffers, so a copy is # necessary. If that's not allowed, we need to raise an exception. 
if allow_copy: diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index c7b13f9fd7b2d..c6365233728d2 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -379,3 +379,12 @@ def test_large_string(): result = pd.api.interchange.from_dataframe(df.__dataframe__()) expected = pd.DataFrame({"a": ["x"]}, dtype="object") tm.assert_frame_equal(result, expected) + + +def test_empty_dataframe(): + # https://github.com/pandas-dev/pandas/issues/56700 + df = pd.DataFrame({"a": []}, dtype="int8") + dfi = df.__dataframe__() + result = pd.api.interchange.from_dataframe(dfi, allow_copy=False) + expected = pd.DataFrame({"a": []}, dtype="int8") + tm.assert_frame_equal(result, expected) From bc09d5765197c39daae8326fc9ee44bb2c3153c4 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 31 Jan 2024 20:20:42 +0100 Subject: [PATCH 085/396] Backport PR #57169 on branch 2.2.x (REGR: DataFrame.sort_index not producing stable sort) (#57180) Backport PR #57169: REGR: DataFrame.sort_index not producing stable sort Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/indexes/base.py | 11 ++++----- pandas/tests/frame/methods/test_sort_index.py | 24 +++++++++++++++++++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 4a20eda08937a..9002d9af2c602 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5e7f2e27f1275..6f9078bf5a2a2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5914,17 +5914,14 @@ def sort_values( (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) """ if key is None and ( - self.is_monotonic_increasing or self.is_monotonic_decreasing + (ascending and self.is_monotonic_increasing) + or (not ascending and self.is_monotonic_decreasing) ): - reverse = ascending != self.is_monotonic_increasing - sorted_index = self[::-1] if reverse else self.copy() if return_indexer: indexer = np.arange(len(self), dtype=np.intp) - if reverse: - indexer = indexer[::-1] - return sorted_index, indexer + return self.copy(), indexer else: - return sorted_index + return 
self.copy()
 
         # GH 35584. Sort missing values according to na_position kwarg
         # ignore na_position for MultiIndex
diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py
index 49e292057e4dc..830561a1349ee 100644
--- a/pandas/tests/frame/methods/test_sort_index.py
+++ b/pandas/tests/frame/methods/test_sort_index.py
@@ -1002,3 +1002,27 @@ def test_axis_columns_ignore_index():
     result = df.sort_index(axis="columns", ignore_index=True)
     expected = DataFrame([[2, 1]])
     tm.assert_frame_equal(result, expected)
+
+
+def test_sort_index_stable_sort():
+    # GH 57151
+    df = DataFrame(
+        data=[
+            (Timestamp("2024-01-30 13:00:00"), 13.0),
+            (Timestamp("2024-01-30 13:00:00"), 13.1),
+            (Timestamp("2024-01-30 12:00:00"), 12.0),
+            (Timestamp("2024-01-30 12:00:00"), 12.1),
+        ],
+        columns=["dt", "value"],
+    ).set_index(["dt"])
+    result = df.sort_index(level="dt", kind="stable")
+    expected = DataFrame(
+        data=[
+            (Timestamp("2024-01-30 12:00:00"), 12.0),
+            (Timestamp("2024-01-30 12:00:00"), 12.1),
+            (Timestamp("2024-01-30 13:00:00"), 13.0),
+            (Timestamp("2024-01-30 13:00:00"), 13.1),
+        ],
+        columns=["dt", "value"],
+    ).set_index(["dt"])
+    tm.assert_frame_equal(result, expected)

From 62aea0f06dc8bb7b029f551ef2bcfb8b08ee8ddf Mon Sep 17 00:00:00 2001
From: Marco Edward Gorelli
Date: Thu, 1 Feb 2024 21:15:39 +0000
Subject: [PATCH 086/396] Backport PR #57173: BUG: pandas int extension dtypes
 has no attribute… (#57198)

Backport PR #57173: BUG: pandas int extension dtypes has no attribute byteorder
---
 doc/source/whatsnew/v2.2.1.rst        |  1 +
 pandas/core/interchange/column.py     |  3 +++
 pandas/tests/interchange/test_impl.py | 12 ++++++++++++
 3 files changed, 16 insertions(+)

diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst
index 9002d9af2c602..56e645d4c55db 100644
--- a/doc/source/whatsnew/v2.2.1.rst
+++ b/doc/source/whatsnew/v2.2.1.rst
@@ -30,6 +30,7 @@ Fixed regressions
 
 Bug fixes
 ~~~~~~~~~
+- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for Nullable integers (:issue:`55069`)
 - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`)
 - Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`)
 
diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py
index ee1b5cd34a7f7..96757072dd854 100644
--- a/pandas/core/interchange/column.py
+++ b/pandas/core/interchange/column.py
@@ -11,6 +11,7 @@
 
 from pandas.core.dtypes.dtypes import (
     ArrowDtype,
+    BaseMaskedDtype,
     DatetimeTZDtype,
 )
 
@@ -143,6 +144,8 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
             byteorder = dtype.numpy_dtype.byteorder
         elif isinstance(dtype, DatetimeTZDtype):
             byteorder = dtype.base.byteorder  # type: ignore[union-attr]
+        elif isinstance(dtype, BaseMaskedDtype):
+            byteorder = dtype.numpy_dtype.byteorder
         else:
             byteorder = dtype.byteorder
 
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
index c6365233728d2..209944427d5dc 100644
--- a/pandas/tests/interchange/test_impl.py
+++ b/pandas/tests/interchange/test_impl.py
@@ -9,6 +9,7 @@
     is_platform_windows,
 )
 from pandas.compat.numpy import np_version_lt1p23
+import pandas.util._test_decorators as td
 
 import pandas as pd
 import pandas._testing as tm
@@ -381,6 
+382,17 @@ def test_large_string(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))] +) +def test_nullable_integers(dtype: str) -> None: + # https://github.com/pandas-dev/pandas/issues/55069 + df = pd.DataFrame({"a": [1]}, dtype=dtype) + expected = pd.DataFrame({"a": [1]}, dtype="int8") + result = pd.api.interchange.from_dataframe(df.__dataframe__()) + tm.assert_frame_equal(result, expected) + + def test_empty_dataframe(): # https://github.com/pandas-dev/pandas/issues/56700 df = pd.DataFrame({"a": []}, dtype="int8") From be8f9f267473133f5436cef564bd13e2872f9bec Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 2 Feb 2024 03:37:48 +0100 Subject: [PATCH 087/396] Backport PR #57163 on branch 2.2.x (CI: Add macOS M1 CI) (#57202) Backport PR #57163: CI: Add macOS M1 CI Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 9 +++++---- .github/workflows/wheels.yml | 18 ++++++++++-------- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311.yaml | 3 +-- ci/deps/actions-312.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- pyproject.toml | 4 ---- 7 files changed, 19 insertions(+), 21 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index a3cffb4b03b93..4c7aa1e1e49ee 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -214,7 +214,8 @@ jobs: timeout-minutes: 90 strategy: matrix: - os: [macos-latest, windows-latest] + # Note: Don't use macOS latest since macos 14 appears to be arm64 only + os: [macos-13, macos-14, windows-latest] env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml] fail-fast: false runs-on: ${{ matrix.os }} @@ -227,8 +228,7 @@ jobs: PANDAS_CI: 1 PYTEST_TARGET: pandas PATTERN: "not slow and not db and not network and not single_cpu" - # GH 47443: PYTEST_WORKERS > 0 crashes Windows builds with memory related errors - PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '0' }} + PYTEST_WORKERS: 'auto' steps: - name: Checkout @@ -354,7 +354,8 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-22.04, macOS-latest, windows-latest] + # Separate out macOS 13 and 14, since macOS 14 is arm64 only + os: [ubuntu-22.04, macOS-13, macOS-14, windows-latest] timeout-minutes: 90 diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 6d3b9048a2122..f79b2c51b5f92 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -94,7 +94,9 @@ jobs: buildplat: - [ubuntu-22.04, manylinux_x86_64] - [ubuntu-22.04, musllinux_x86_64] - - [macos-12, macosx_*] + - [macos-12, macosx_x86_64] + # Note: M1 images on Github Actions start from macOS 14 + - [macos-14, macosx_arm64] - [windows-2022, win_amd64] # TODO: support PyPy? 
python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] @@ -128,7 +130,7 @@ jobs: # Python version used to build sdist doesn't matter # wheel will be built from sdist with the correct version - name: Unzip sdist (macOS) - if: ${{ matrix.buildplat[1] == 'macosx_*' }} + if: ${{ startsWith(matrix.buildplat[1], 'macosx') }} run: | tar -xzf ./dist/${{ env.sdist_name }} -C ./dist @@ -139,18 +141,18 @@ jobs: - name: Build normal wheels if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }} - uses: pypa/cibuildwheel@v2.16.4 + uses: pypa/cibuildwheel@v2.16.5 with: - package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} + package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: CIBW_PRERELEASE_PYTHONS: True CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} - name: Build nightly wheels (with NumPy pre-release) if: ${{ (env.IS_SCHEDULE_DISPATCH == 'true' && env.IS_PUSH != 'true') }} - uses: pypa/cibuildwheel@v2.16.4 + uses: pypa/cibuildwheel@v2.16.5 with: - package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} + package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: # The nightly wheels should be build witht he NumPy 2.0 pre-releases # which requires the additional URL. @@ -183,7 +185,7 @@ jobs: $TST_CMD = @" python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0; python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); - python -c `'import pandas as pd; pd.test(extra_args=[\"`\"--no-strict-data-files`\"\", \"`\"-m not clipboard and not single_cpu and not slow and not network and not db`\"\"])`'; + python -c `'import pandas as pd; pd.test(extra_args=[`\"--no-strict-data-files`\", `\"-m not clipboard and not single_cpu and not slow and not network and not db`\"])`'; "@ # add rc to the end of the image name if the Python version is unreleased docker pull python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} @@ -191,7 +193,7 @@ jobs: - uses: actions/upload-artifact@v4 with: - name: ${{ matrix.python[0] }}-${{ startsWith(matrix.buildplat[1], 'macosx') && 'macosx' || matrix.buildplat[1] }} + name: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} path: ./wheelhouse/*.whl - name: Upload wheels & sdist diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 45f114322015b..a3e44e6373145 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - pytest-qt>=4.2.0 - boto3 @@ -61,3 +60,4 @@ dependencies: - adbc-driver-postgresql>=0.8.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index d14686696e669..95cd1a4d46ef4 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - pytest-qt>=4.2.0 - boto3 @@ -60,4 +59,4 @@ dependencies: - pip: - adbc-driver-postgresql>=0.8.0 - adbc-driver-sqlite>=0.8.0 - - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 86aaf24b4e15c..a442ed6feeb5d 100644 --- 
a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - pytest-qt>=4.2.0 - boto3 @@ -61,3 +60,4 @@ dependencies: - adbc-driver-postgresql>=0.8.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 31ee74174cd46..b162a78e7f115 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - pytest-qt>=4.2.0 - boto3 @@ -61,3 +60,4 @@ dependencies: - adbc-driver-postgresql>=0.8.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff --git a/pyproject.toml b/pyproject.toml index 2f70ade7b3afe..8c9a79aa2b059 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -162,10 +162,6 @@ test-command = """ pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ """ -[tool.cibuildwheel.macos] -archs = "x86_64 arm64" -test-skip = "*_arm64" - [tool.cibuildwheel.windows] before-build = "pip install delvewheel" repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}" From 5034b780e90f499d4da8dc2e461aca19304c5263 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 2 Feb 2024 03:38:20 +0100 Subject: [PATCH 088/396] Backport PR #57174 on branch 2.2.x (BUG: Interchange protocol implementation allows non-string column names) (#57203) Backport PR #57174: BUG: Interchange protocol implementation allows non-string column names Co-authored-by: Marco Edward Gorelli --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/interchange/column.py | 8 ++++++++ pandas/core/interchange/dataframe.py | 2 +- pandas/tests/interchange/test_impl.py | 26 ++++++++++++++++++++++++-- 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 56e645d4c55db..13d5024b5a131 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -32,6 +32,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for Nullable integers (:issue:`55069`) - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`) +- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which wasn't converting column names to strings (:issue:`55069`) - Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index 96757072dd854..e273ecad8b51e 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -77,6 +77,14 @@ def __init__(self, column: pd.Series, allow_copy: bool = True) -> None: Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. """ + if isinstance(column, pd.DataFrame): + raise TypeError( + "Expected a Series, got a DataFrame. This likely happened " + "because you called __dataframe__ on a DataFrame which, " + "after converting column names to string, resulted in duplicated " + f"names: {column.columns}. Please rename these columns before " + "using the interchange protocol."
+ ) if not isinstance(column, pd.Series): raise NotImplementedError(f"Columns of type {type(column)} not handled yet") diff --git a/pandas/core/interchange/dataframe.py b/pandas/core/interchange/dataframe.py index 4f08b2c2b3a7b..1ffe0e8e8dbb0 100644 --- a/pandas/core/interchange/dataframe.py +++ b/pandas/core/interchange/dataframe.py @@ -32,7 +32,7 @@ def __init__(self, df: DataFrame, allow_copy: bool = True) -> None: Constructor - an instance of this (private) class is returned from `pd.DataFrame.__dataframe__`. """ - self._df = df + self._df = df.rename(columns=str, copy=False) self._allow_copy = allow_copy def __dataframe__( diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 209944427d5dc..d47b533f92235 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -180,8 +180,6 @@ def test_missing_from_masked(): } ) - df2 = df.__dataframe__() - rng = np.random.default_rng(2) dict_null = {col: rng.integers(low=0, high=len(df)) for col in df.columns} for col, num_nulls in dict_null.items(): @@ -382,6 +380,30 @@ def test_large_string(): tm.assert_frame_equal(result, expected) +def test_non_str_names(): + # https://github.com/pandas-dev/pandas/issues/56701 + df = pd.Series([1, 2, 3], name=0).to_frame() + names = df.__dataframe__().column_names() + assert names == ["0"] + + +def test_non_str_names_w_duplicates(): + # https://github.com/pandas-dev/pandas/issues/56701 + df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]}) + dfi = df.__dataframe__() + with pytest.raises( + TypeError, + match=( + "Expected a Series, got a DataFrame. This likely happened because you " + "called __dataframe__ on a DataFrame which, after converting column " + r"names to string, resulted in duplicated names: Index\(\['0', '0'\], " + r"dtype='object'\). Please rename these columns before using the " + "interchange protocol." 
+ ), + ): + pd.api.interchange.from_dataframe(dfi, allow_copy=False) + + @pytest.mark.parametrize( "dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))] ) From e54e0e2a63af60e88fad5d5a592bdf957521f3a9 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 4 Feb 2024 04:08:14 +0100 Subject: [PATCH 089/396] Backport PR #57232 on branch 2.2.x (REGR: to_json converting nullable ints to floats) (#57240) Backport PR #57232: REGR: to_json converting nullable ints to floats Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/arrays/arrow/array.py | 5 +++++ pandas/core/arrays/masked.py | 3 +++ pandas/tests/io/json/test_pandas.py | 16 ++++++++++++++++ 4 files changed, 25 insertions(+) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 13d5024b5a131..3cc11974b14e5 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -20,6 +20,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) +- Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a5ce46ed612f3..621a32a049f3b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1348,6 +1348,11 @@ def _to_timedeltaarray(self) -> TimedeltaArray: np_array = np_array.astype(np_dtype) return TimedeltaArray._simple_new(np_array, dtype=np_dtype) + def _values_for_json(self) -> np.ndarray: + if is_numeric_dtype(self.dtype): + return np.asarray(self, dtype=object) + return super()._values_for_json() + @doc(ExtensionArray.to_numpy) def to_numpy( self, diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 234d96e53a67c..8c41de9c9fffa 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -430,6 +430,9 @@ def __abs__(self) -> Self: # ------------------------------------------------------------------ + def _values_for_json(self) -> np.ndarray: + return np.asarray(self, dtype=object) + def to_numpy( self, dtype: npt.DTypeLike | None = None, diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 1da27ad173235..caa25841d3596 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2172,3 +2172,19 @@ def test_json_pos_args_deprecation(): with 
tm.assert_produces_warning(FutureWarning, match=msg): buf = BytesIO() df.to_json(buf, "split") + + +@td.skip_if_no("pyarrow") +def test_to_json_ea_null(): + # GH#57224 + df = DataFrame( + { + "a": Series([1, NA], dtype="int64[pyarrow]"), + "b": Series([2, NA], dtype="Int64"), + } + ) + result = df.to_json(orient="records", lines=True) + expected = """{"a":1,"b":2} +{"a":null,"b":null} +""" + assert result == expected From c1b17ae8dcebb7faaa90ea520a7e407321daa594 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 6 Feb 2024 03:28:16 +0100 Subject: [PATCH 090/396] Backport PR #57265 on branch 2.2.x (COMPAT: Numpy 2.0 casting compat) (#57271) Backport PR #57265: COMPAT: Numpy 2.0 casting compat Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/dtypes/cast.py | 1 + pandas/core/internals/blocks.py | 9 ++++++++- pandas/tests/indexing/test_loc.py | 16 ++-------------- pandas/tests/series/test_constructors.py | 12 +++++++++--- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 259e83a5936d7..690af6b0ebdc7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1682,6 +1682,7 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n arr = np.asarray(arr) if np.issubdtype(arr.dtype, str): + # TODO(numpy-2.0 min): This case will raise an OverflowError above if (casted.astype(str) == arr).all(): return casted raise ValueError(f"string values cannot be losslessly cast to {dtype}") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 70a27300bd60f..259e969112dd7 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1425,7 +1425,14 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: if isinstance(casted, np.ndarray) and casted.ndim == 1 and len(casted) == 1: # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 casted = casted[0, ...] - values[indexer] = casted + try: + values[indexer] = casted + except (TypeError, ValueError) as err: + if is_list_like(casted): + raise ValueError( + "setting an array element with a sequence." + ) from err + raise return self def putmask( diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 952251a58e981..0cd1390d41461 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1235,13 +1235,7 @@ def test_loc_setitem_empty_append_raises(self): with pytest.raises(KeyError, match=msg): df.loc[[0, 1], "x"] = data - msg = "|".join( - [ - "cannot copy sequence with size 2 to array axis with dimension 0", - r"could not broadcast input array from shape \(2,\) into shape \(0,\)", - "Must have equal len keys and value when setting with an iterable", - ] - ) + msg = "setting an array element with a sequence." with pytest.raises(ValueError, match=msg): df.loc[0:2, "x"] = data @@ -1575,16 +1569,10 @@ def test_loc_setitem_2d_to_1d_raises(self): # float64 dtype to avoid upcast when trying to set float data ser = Series(range(2), dtype="float64") - msg = "|".join( - [ - r"shape mismatch: value array of shape \(2,2\)", - r"cannot reshape array of size 4 into shape \(2,\)", - ] - ) + msg = "setting an array element with a sequence." 
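# A hedged aside, not from the patch itself: with this change the 2-D
# assignment below fails with pandas' own message on NumPy 1.x and 2.0 alike,
# instead of whichever version-dependent error NumPy's casting machinery
# raises. A minimal sketch:
#
#     ser = pd.Series(range(2), dtype="float64")
#     ser.loc[range(2)] = np.ones((2, 2))
#     # ValueError: setting an array element with a sequence.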
with pytest.raises(ValueError, match=msg): ser.loc[range(2)] = data - msg = r"could not broadcast input array from shape \(2,2\) into shape \(2,?\)" with pytest.raises(ValueError, match=msg): ser.loc[:] = data diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index da069afe5e709..4d3839553a0af 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1958,9 +1958,15 @@ def test_constructor_int64_dtype(self, any_int_dtype): def test_constructor_raise_on_lossy_conversion_of_strings(self): # GH#44923 - with pytest.raises( - ValueError, match="string values cannot be losslessly cast to int8" - ): + if not np_version_gt2: + raises = pytest.raises( + ValueError, match="string values cannot be losslessly cast to int8" + ) + else: + raises = pytest.raises( + OverflowError, match="The elements provided in the data" + ) + with raises: Series(["128"], dtype="int8") def test_constructor_dtype_timedelta_alternative_construct(self): From 45fc954839e79605b14af4b771da56b12b7283cb Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 7 Feb 2024 15:12:00 +0000 Subject: [PATCH 091/396] Backport PR #56945 on branch 2.2.x (ENH: raise ValueError if invalid period freq pass to asfreq when the index of df is a PeriodIndex) (#57292) 'Backport PR #56945: ENH: raise ValueError if invalid period freq pass to asfreq when the index of df is a PeriodIndex' (cherry picked from commit cb97ce6e496e7d1df76f06550c0fe8c4590ff3ce) Co-authored-by: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/tslibs/offsets.pyx | 32 +++++++----- pandas/core/arrays/period.py | 11 ++-- pandas/plotting/_matplotlib/timeseries.py | 5 +- pandas/tests/dtypes/test_dtypes.py | 10 ++-- .../datetimes/methods/test_to_period.py | 2 +- .../indexes/period/methods/test_asfreq.py | 51 +++++++++++++++++++ pandas/tests/resample/test_period_index.py | 4 +- pandas/tests/scalar/period/test_asfreq.py | 5 +- pandas/tests/scalar/period/test_period.py | 16 +++--- 10 files changed, 96 insertions(+), 41 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 3cc11974b14e5..3f70c72a55a4c 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -35,6 +35,7 @@ Bug fixes - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`) - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which wasn't converting columns names to strings (:issue:`55069`) - Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) +- Fixed bug in :meth:`PeriodIndex.asfreq` which was silently converting frequencies which are not supported as period frequencies instead of raising an error (:issue:`56945`) .. --------------------------------------------------------------------------- .. 
_whatsnew_221.other: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 764a044f32c82..70e1ca1c4012a 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4271,9 +4271,7 @@ cdef class CustomBusinessDay(BusinessDay): @property def _period_dtype_code(self): # GH#52534 - raise TypeError( - "CustomBusinessDay is not supported as period frequency" - ) + raise ValueError(f"{self.base} is not supported as period frequency") _apply_array = BaseOffset._apply_array @@ -4822,19 +4820,19 @@ cpdef to_offset(freq, bint is_period=False): if freq is None: return None - if isinstance(freq, BaseOffset): - return freq - if isinstance(freq, tuple): raise TypeError( f"to_offset does not support tuples {freq}, pass as a string instead" ) + if isinstance(freq, BaseOffset): + result = freq + elif PyDelta_Check(freq): - return delta_to_tick(freq) + result = delta_to_tick(freq) elif isinstance(freq, str): - delta = None + result = None stride_sign = None try: @@ -4935,21 +4933,27 @@ cpdef to_offset(freq, bint is_period=False): offset = _get_offset(prefix) offset = offset * int(np.fabs(stride) * stride_sign) - if delta is None: - delta = offset + if result is None: + result = offset else: - delta = delta + offset + result = result + offset except (ValueError, TypeError) as err: raise ValueError(INVALID_FREQ_ERR_MSG.format( f"{freq}, failed to parse with error message: {repr(err)}") ) else: - delta = None + result = None - if delta is None: + if result is None: raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) - return delta + if is_period and not hasattr(result, "_period_dtype_code"): + if isinstance(freq, str): + raise ValueError(f"{result.name} is not supported as period frequency") + else: + raise ValueError(f"{freq} is not supported as period frequency") + + return result # ---------------------------------------------------------------------- diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 28f25d38b2363..d635eb4a25df3 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -733,8 +733,8 @@ def asfreq(self, freq=None, how: str = "E") -> Self: '2015-01'], dtype='period[M]') """ how = libperiod.validate_end_alias(how) - if isinstance(freq, BaseOffset): - freq = freq_to_period_freqstr(freq.n, freq.name) + if isinstance(freq, BaseOffset) and hasattr(freq, "_period_dtype_code"): + freq = PeriodDtype(freq)._freqstr freq = Period._maybe_convert_freq(freq) base1 = self._dtype._dtype_code @@ -1186,12 +1186,7 @@ def dt64arr_to_periodarr( reso = get_unit_from_dtype(data.dtype) freq = Period._maybe_convert_freq(freq) - try: - base = freq._period_dtype_code - except (AttributeError, TypeError): - # AttributeError: _period_dtype_code might not exist - # TypeError: _period_dtype_code might intentionally raise - raise TypeError(f"{freq.name} is not supported as period frequency") + base = freq._period_dtype_code return c_dt64arr_to_periodarr(data.view("i8"), base, tz, reso=reso), freq diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index bf1c0f6346f02..c7ddfa55d0417 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -205,7 +205,10 @@ def _get_ax_freq(ax: Axes): def _get_period_alias(freq: timedelta | BaseOffset | str) -> str | None: - freqstr = to_offset(freq, is_period=True).rule_code + if isinstance(freq, BaseOffset): + freqstr = freq.name + else: + freqstr = to_offset(freq, is_period=True).rule_code 
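# A hedged sketch, not part of the patch: the user-visible effect of PR #56945
# is that frequencies with no period equivalent now fail loudly instead of
# being silently converted, e.g.
#
#     pi = pd.PeriodIndex(["2020-01", "2021-01"], freq="M")
#     pi.asfreq("2BMS")  # ValueError: BMS is not supported as period frequency
#     pi.asfreq("2QE")   # ValueError: Invalid frequency: 2QE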
return get_period_alias(freqstr) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 0dad0b05303ad..de1ddce724a5b 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -445,12 +445,12 @@ def test_construction(self): def test_cannot_use_custom_businessday(self): # GH#52534 - msg = "CustomBusinessDay is not supported as period frequency" + msg = "C is not supported as period frequency" + msg1 = " is not supported as period frequency" msg2 = r"PeriodDtype\[B\] is deprecated" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - PeriodDtype("C") - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): + PeriodDtype("C") + with pytest.raises(ValueError, match=msg1): with tm.assert_produces_warning(FutureWarning, match=msg2): PeriodDtype(pd.offsets.CustomBusinessDay()) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 00c0216a9b3b5..de8d32f64cde2 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -221,5 +221,5 @@ def test_to_period_offsets_not_supported(self, freq): # GH#56243 msg = f"{freq[1:]} is not supported as period frequency" ts = date_range("1/1/2012", periods=4, freq=freq) - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): ts.to_period() diff --git a/pandas/tests/indexes/period/methods/test_asfreq.py b/pandas/tests/indexes/period/methods/test_asfreq.py index ed078a3e8fb8b..865bae69d91c7 100644 --- a/pandas/tests/indexes/period/methods/test_asfreq.py +++ b/pandas/tests/indexes/period/methods/test_asfreq.py @@ -1,3 +1,5 @@ +import re + import pytest from pandas import ( @@ -7,6 +9,8 @@ ) import pandas._testing as tm +from pandas.tseries import offsets + class TestPeriodIndex: def test_asfreq(self): @@ -136,3 +140,50 @@ def test_asfreq_with_different_n(self): excepted = Series([1, 2], index=PeriodIndex(["2020-02", "2020-04"], freq="M")) tm.assert_series_equal(result, excepted) + + @pytest.mark.parametrize( + "freq", + [ + "2BMS", + "2YS-MAR", + "2bh", + ], + ) + def test_pi_asfreq_not_supported_frequency(self, freq): + # GH#55785 + msg = f"{freq[1:]} is not supported as period frequency" + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) + + @pytest.mark.parametrize( + "freq", + [ + "2BME", + "2YE-MAR", + "2QE", + ], + ) + def test_pi_asfreq_invalid_frequency(self, freq): + # GH#55785 + msg = f"Invalid frequency: {freq}" + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) + + @pytest.mark.parametrize( + "freq", + [ + offsets.MonthBegin(2), + offsets.BusinessMonthEnd(2), + ], + ) + def test_pi_asfreq_invalid_baseoffset(self, freq): + # GH#56945 + msg = re.escape(f"{freq} is not supported as period frequency") + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 451d2a83c1d5e..6b7cce7d15a5b 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -1040,8 +1040,8 @@ def test_resample_lowercase_frequency_deprecated( offsets.BusinessHour(2), ], ) - 
def test_asfreq_invalid_period_freq(self, offset, series_and_frame): - # GH#9586 + def test_asfreq_invalid_period_offset(self, offset, series_and_frame): + # GH#55785 msg = f"Invalid offset: '{offset.base}' for converting time series " df = series_and_frame diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 4489c307172d7..73c4d8061c257 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -820,10 +820,9 @@ def test_asfreq_MS(self): assert initial.asfreq(freq="M", how="S") == Period("2013-01", "M") - msg = INVALID_FREQ_ERR_MSG + msg = "MS is not supported as period frequency" with pytest.raises(ValueError, match=msg): initial.asfreq(freq="MS", how="S") - msg = "MonthBegin is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): Period("2013-01", "MS") diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index d819e903a0bae..2c3a0816737fc 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -3,6 +3,7 @@ datetime, timedelta, ) +import re import numpy as np import pytest @@ -40,21 +41,22 @@ class TestPeriodDisallowedFreqs: ) def test_offsets_not_supported(self, freq, freq_msg): # GH#55785 - msg = f"{freq_msg} is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + msg = re.escape(f"{freq} is not supported as period frequency") + with pytest.raises(ValueError, match=msg): Period(year=2014, freq=freq) def test_custom_business_day_freq_raises(self): # GH#52534 - msg = "CustomBusinessDay is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + msg = "C is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2023-04-10", freq="C") - with pytest.raises(TypeError, match=msg): + msg = f"{offsets.CustomBusinessDay().base} is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2023-04-10", freq=offsets.CustomBusinessDay()) def test_invalid_frequency_error_message(self): - msg = "WeekOfMonth is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + msg = "WOM-1MON is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2012-01-02", freq="WOM-1MON") def test_invalid_frequency_period_error_message(self): From 11a6136ce2efc25052849ff760b260ca49e371f0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 7 Feb 2024 19:02:06 +0100 Subject: [PATCH 092/396] Backport PR #57233 on branch 2.2.x (REGR: Fix to_numpy conversion for arrow ea with float dtype given) (#57294) Backport PR #57233: REGR: Fix to_numpy conversion for arrow ea with float dtype given Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/arrays/_utils.py | 2 ++ pandas/tests/arrays/boolean/test_construction.py | 2 -- pandas/tests/arrays/floating/test_to_numpy.py | 8 ++++---- pandas/tests/arrays/integer/test_dtypes.py | 2 +- pandas/tests/series/methods/test_to_numpy.py | 11 +++++++++++ 6 files changed, 19 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 3f70c72a55a4c..883627bd4b19b 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -25,6 +25,7 @@ Fixed regressions 
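A hedged sketch of the regression this backport addresses (it mirrors the test added further below and assumes pyarrow is installed): asking ``to_numpy`` for a float NumPy dtype on nullable or Arrow-backed data containing missing values raised instead of filling the gaps with ``np.nan``:

    ser = pd.Series([1, None], dtype="int64[pyarrow]")
    ser.to_numpy(dtype="float64")  # with the fix: array([ 1., nan])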
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) +- Fixed regression in :meth:`Series.to_numpy` when dtype is given as float and the data contains NaNs (:issue:`57121`) .. --------------------------------------------------------------------------- .. _whatsnew_221.bug_fixes: diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index c75ec7f843ed2..88091a88a4e12 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -39,6 +39,8 @@ def to_numpy_dtype_inference( dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] elif dtype is not None: dtype = np.dtype(dtype) + if na_value is lib.no_default and hasna and dtype.kind == "f": + na_value = np.nan dtype_given = True else: dtype_given = True diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index a5a2dd33940b8..645e763fbf00c 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -308,8 +308,6 @@ def test_to_numpy(box): # converting to int or float without specifying na_value raises with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): arr.to_numpy(dtype="int64") - with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): - arr.to_numpy(dtype="float64") def test_to_numpy_copy(): diff --git a/pandas/tests/arrays/floating/test_to_numpy.py b/pandas/tests/arrays/floating/test_to_numpy.py index a25ac40cb3e7c..e954cecba417a 100644 --- a/pandas/tests/arrays/floating/test_to_numpy.py +++ b/pandas/tests/arrays/floating/test_to_numpy.py @@ -33,10 +33,10 @@ def test_to_numpy_float(box): tm.assert_numpy_array_equal(result, expected) arr = con([0.1, 0.2, None], dtype="Float64") - with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): - result = arr.to_numpy(dtype="float64") + result = arr.to_numpy(dtype="float64") + expected = np.array([0.1, 0.2, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) - # need to explicitly specify na_value result = arr.to_numpy(dtype="float64", na_value=np.nan) expected = np.array([0.1, 0.2, np.nan], dtype="float64") tm.assert_numpy_array_equal(result, expected) @@ -100,7 +100,7 @@ def test_to_numpy_dtype(box, dtype): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"]) +@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"]) @pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) def test_to_numpy_na_raises(box, dtype): con = pd.Series if box else pd.array diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index e3848cdfe3aa9..8620763988e06 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -271,7 +271,7 @@ def test_to_numpy_dtype(dtype, in_series): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("dtype", ["float64", "int64", "bool"]) +@pytest.mark.parametrize("dtype", 
["int64", "bool"]) def test_to_numpy_na_raises(dtype): a = pd.array([0, 1, None], dtype="Int64") with pytest.raises(ValueError, match=dtype): diff --git a/pandas/tests/series/methods/test_to_numpy.py b/pandas/tests/series/methods/test_to_numpy.py index 5fe3e19b0a20b..8dcc1dd551315 100644 --- a/pandas/tests/series/methods/test_to_numpy.py +++ b/pandas/tests/series/methods/test_to_numpy.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( NA, Series, @@ -23,3 +25,12 @@ def test_to_numpy_cast_before_setting_na(): result = ser.to_numpy(dtype=np.float64, na_value=np.nan) expected = np.array([1.0]) tm.assert_numpy_array_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_to_numpy_arrow_dtype_given(): + # GH#57121 + ser = Series([1, NA], dtype="int64[pyarrow]") + result = ser.to_numpy(dtype="float64") + expected = np.array([1.0, np.nan]) + tm.assert_numpy_array_equal(result, expected) From bbc655d782c2c62ded6b4e305234cbce23ac9f22 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 10 Feb 2024 00:29:56 +0100 Subject: [PATCH 093/396] Backport PR #57121 on branch 2.2.x (REGR: Fix to_numpy for masked array with non-numeric dtype) (#57325) Backport PR #57121: REGR: Fix to_numpy for masked array with non-numeric dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 3 +++ pandas/core/arrays/masked.py | 2 ++ pandas/tests/arrays/masked/test_function.py | 17 +++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 883627bd4b19b..009d4794dbd1d 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -17,12 +17,15 @@ Fixed regressions - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) +- Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) +- Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or 
maximum value for the dtype could produce incorrect results (:issue:`57040`) +- Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) - Fixed regression in :meth:`Series.to_numpy` when dtype is given as float and the data contains NaNs (:issue:`57121`) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 8c41de9c9fffa..7c49ce5343158 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -501,6 +501,8 @@ def to_numpy( """ hasna = self._hasna dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna) + if dtype is None: + dtype = object if hasna: if ( diff --git a/pandas/tests/arrays/masked/test_function.py b/pandas/tests/arrays/masked/test_function.py index 4c7bd6e293ef4..b259018cd6121 100644 --- a/pandas/tests/arrays/masked/test_function.py +++ b/pandas/tests/arrays/masked/test_function.py @@ -5,6 +5,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays import BaseMaskedArray arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES] arrays += [ @@ -55,3 +56,19 @@ def test_tolist(data): result = data.tolist() expected = list(data) tm.assert_equal(result, expected) + + +def test_to_numpy(): + # GH#56991 + + class MyStringArray(BaseMaskedArray): + dtype = pd.StringDtype() + _dtype_cls = pd.StringDtype + _internal_fill_value = pd.NA + + arr = MyStringArray( + values=np.array(["a", "b", "c"]), mask=np.array([False, True, False]) + ) + result = arr.to_numpy() + expected = np.array(["a", pd.NA, "c"]) + tm.assert_numpy_array_equal(result, expected) From 361b0899dfd66cdc8eebcfb0ef81285cf126d8a1 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 10 Feb 2024 00:30:08 +0100 Subject: [PATCH 094/396] Backport PR #57250 on branch 2.2.x (REGR/DOC: pd.concat should special case DatetimeIndex to sort even when sort=False) (#57326) Backport PR #57250: REGR/DOC: pd.concat should special case DatetimeIndex to sort even when sort=False Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/indexes/api.py | 1 + pandas/core/reshape/concat.py | 6 ++++-- pandas/tests/reshape/concat/test_datetimes.py | 12 ++++++------ 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 009d4794dbd1d..69e14a9028dd3 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -15,6 +15,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed memory leak in :func:`read_csv` (:issue:`57039`) - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) +- Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, 
:meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 560285bd57a22..15292953e72d0 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -295,6 +295,7 @@ def _find_common_index_dtype(inds): raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") if len(dtis) == len(indexes): + sort = True result = indexes[0] elif len(dtis) > 1: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index aacea92611697..dc18bb65b35bc 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -205,8 +205,10 @@ def concat( Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation. sort : bool, default False - Sort non-concatenation axis if it is not already aligned. - + Sort non-concatenation axis if it is not already aligned. One exception to + this is when the non-concatenation axis is a DatetimeIndex and join='outer' + and the axis is not already aligned. In that case, the non-concatenation + axis is always sorted lexicographically. copy : bool, default True If False, do not copy data unnecessarily. diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 71ddff7438254..4c94dc0d51f7e 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -73,23 +73,23 @@ def test_concat_datetime_timezone(self): exp_idx = DatetimeIndex( [ - "2010-12-31 23:00:00+00:00", - "2011-01-01 00:00:00+00:00", - "2011-01-01 01:00:00+00:00", "2010-12-31 15:00:00+00:00", "2010-12-31 16:00:00+00:00", "2010-12-31 17:00:00+00:00", + "2010-12-31 23:00:00+00:00", + "2011-01-01 00:00:00+00:00", + "2011-01-01 01:00:00+00:00", ] ).as_unit("ns") expected = DataFrame( [ - [1, np.nan], - [2, np.nan], - [3, np.nan], [np.nan, 1], [np.nan, 2], [np.nan, 3], + [1, np.nan], + [2, np.nan], + [3, np.nan], ], index=exp_idx, columns=["a", "b"], From 044342727b2b7efacefa2c2dc485193897c47b0c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 10 Feb 2024 12:25:24 +0100 Subject: [PATCH 095/396] Backport PR #57322 on branch 2.2.x (REGR: Fix astype conversion of ea int to td/dt with missing values) (#57331) Backport PR #57322: REGR: Fix astype conversion of ea int to td/dt with missing values Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/arrays/_utils.py | 13 ++++++++++--- pandas/tests/extension/test_arrow.py | 9 +++++++++ pandas/tests/series/methods/test_to_numpy.py | 13 +++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index 88091a88a4e12..6b46396d5efdf 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -39,14 +39,21 @@ def to_numpy_dtype_inference( dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] elif dtype is not None: dtype = np.dtype(dtype) - if na_value is lib.no_default and hasna and dtype.kind == "f": - na_value = np.nan dtype_given = True else: dtype_given = True if na_value is lib.no_default: - na_value = arr.dtype.na_value + if dtype is None or not hasna: + na_value = arr.dtype.na_value + elif dtype.kind == "f": # type: ignore[union-attr] + na_value = np.nan + elif dtype.kind == "M": # type: ignore[union-attr] + na_value = np.datetime64("nat") + elif dtype.kind 
== "m": # type: ignore[union-attr] + na_value = np.timedelta64("nat") + else: + na_value = arr.dtype.na_value if not dtype_given and hasna: try: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 05a112e464677..e041093bbf5bc 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3305,6 +3305,15 @@ def test_arrow_floordiv_floating_0_divisor(dtype): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("dtype", ["float64", "datetime64[ns]", "timedelta64[ns]"]) +def test_astype_int_with_null_to_numpy_dtype(dtype): + # GH 57093 + ser = pd.Series([1, None], dtype="int64[pyarrow]") + result = ser.astype(dtype) + expected = pd.Series([1, None], dtype=dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES) def test_arrow_integral_floordiv_large_values(pa_type): # GH 56676 diff --git a/pandas/tests/series/methods/test_to_numpy.py b/pandas/tests/series/methods/test_to_numpy.py index 8dcc1dd551315..4bc7631090761 100644 --- a/pandas/tests/series/methods/test_to_numpy.py +++ b/pandas/tests/series/methods/test_to_numpy.py @@ -6,6 +6,7 @@ from pandas import ( NA, Series, + Timedelta, ) import pandas._testing as tm @@ -34,3 +35,15 @@ def test_to_numpy_arrow_dtype_given(): result = ser.to_numpy(dtype="float64") expected = np.array([1.0, np.nan]) tm.assert_numpy_array_equal(result, expected) + + +def test_astype_ea_int_to_td_ts(): + # GH#57093 + ser = Series([1, None], dtype="Int64") + result = ser.astype("m8[ns]") + expected = Series([1, Timedelta("nat")], dtype="m8[ns]") + tm.assert_series_equal(result, expected) + + result = ser.astype("M8[ns]") + expected = Series([1, Timedelta("nat")], dtype="M8[ns]") + tm.assert_series_equal(result, expected) From 28a7ec7d34be480b9a14e88236500fd74a2e0109 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 10 Feb 2024 16:06:36 +0100 Subject: [PATCH 096/396] Backport PR #57329 on branch 2.2.x (REGR: CategoricalIndex.difference with null values) (#57336) Backport PR #57329: REGR: CategoricalIndex.difference with null values Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/indexes/base.py | 7 +++++-- .../tests/indexes/categorical/test_setops.py | 18 ++++++++++++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 pandas/tests/indexes/categorical/test_setops.py diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 69e14a9028dd3..335dada439029 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -20,6 +20,7 @@ Fixed regressions - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) +- Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes 
where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6f9078bf5a2a2..4b3d1a9e006dc 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3663,9 +3663,12 @@ def difference(self, other, sort=None): def _difference(self, other, sort): # overridden by RangeIndex + this = self + if isinstance(self, ABCCategoricalIndex) and self.hasnans and other.hasnans: + this = this.dropna() other = other.unique() - the_diff = self[other.get_indexer_for(self) == -1] - the_diff = the_diff if self.is_unique else the_diff.unique() + the_diff = this[other.get_indexer_for(this) == -1] + the_diff = the_diff if this.is_unique else the_diff.unique() the_diff = _maybe_try_sort(the_diff, sort) return the_diff diff --git a/pandas/tests/indexes/categorical/test_setops.py b/pandas/tests/indexes/categorical/test_setops.py new file mode 100644 index 0000000000000..2e87b90efd54c --- /dev/null +++ b/pandas/tests/indexes/categorical/test_setops.py @@ -0,0 +1,18 @@ +import numpy as np +import pytest + +from pandas import ( + CategoricalIndex, + Index, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("na_value", [None, np.nan]) +def test_difference_with_na(na_value): + # GH 57318 + ci = CategoricalIndex(["a", "b", "c", None]) + other = Index(["c", na_value]) + result = ci.difference(other) + expected = CategoricalIndex(["a", "b"], categories=["a", "b", "c"]) + tm.assert_index_equal(result, expected) From 10b26fefce26e1b2d29d2f7ca39be3b28d875def Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 10 Feb 2024 16:09:36 -0500 Subject: [PATCH 097/396] Backport PR #57333 on branch 2.2.x (REGR: merge with 3rd party EA's can sometimes raise ValueError) (#57337) Backport PR #57333: REGR: merge with 3rd party EA's can sometimes raise ValueError --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/index.pyx | 2 +- pandas/tests/indexes/test_base.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 335dada439029..b342eef850459 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -22,6 +22,7 @@ Fixed regressions - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to 
floats (:issue:`57224`) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index e4dfe9dec3623..ee6a11ddab004 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -295,7 +295,7 @@ cdef class IndexEngine: values = self.values self.monotonic_inc, self.monotonic_dec, is_strict_monotonic = \ self._call_monotonic(values) - except TypeError: + except (TypeError, ValueError): self.monotonic_inc = 0 self.monotonic_dec = 0 is_strict_monotonic = 0 diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 666d92064c86c..1fa48f98942c2 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1722,3 +1722,13 @@ def test_nan_comparison_same_object(op): result = op(idx, idx.copy()) tm.assert_numpy_array_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_is_monotonic_pyarrow_list_type(): + # GH 57333 + import pyarrow as pa + + idx = Index([[1], [2, 3]], dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + assert not idx.is_monotonic_increasing + assert not idx.is_monotonic_decreasing From 947f5aeda9d924ce7d93a46457574123bb7fb07a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 10 Feb 2024 22:58:09 +0100 Subject: [PATCH 098/396] Backport PR #57323 on branch 2.2.x (REGR: Fix regression when grouping over a Series) (#57339) Backport PR #57323: REGR: Fix regression when grouping over a Series Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/internals/managers.py | 5 ++--- pandas/tests/copy_view/test_methods.py | 11 +++++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index b342eef850459..eb6e50c1be160 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -21,6 +21,7 @@ Fixed regressions - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) +- Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3719bf1f77f85..220bb1133dfd5 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -12,7 +12,6 @@ cast, ) import warnings -import weakref import numpy as np @@ -282,8 +281,8 @@ def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool: Checks if two blocks from two different block managers 
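A hedged sketch, not part of the patch: the weakref-based comparison replaced below could miss blocks that are in fact identical, which surfaced as grouping by a Series raising in some cases (GH 57276), e.g.

    df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2], "C": ["a", "b"]}).set_index("A")
    df.groupby(df["C"], observed=True).sum()  # ValueError before this fix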
reference the same underlying values. """ - ref = weakref.ref(self.blocks[blkno]) - return ref in mgr.blocks[blkno].refs.referenced_blocks + blk = self.blocks[blkno] + return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks) def get_dtypes(self) -> npt.NDArray[np.object_]: dtypes = np.array([blk.dtype for blk in self.blocks], dtype=object) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 862aebdc70a9d..5d1eefccbb1e7 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -280,6 +280,17 @@ def test_reset_index_series_drop(using_copy_on_write, index): tm.assert_series_equal(ser, ser_orig) +def test_groupby_column_index_in_references(): + df = DataFrame( + {"A": ["a", "b", "c", "d"], "B": [1, 2, 3, 4], "C": ["a", "a", "b", "b"]} + ) + df = df.set_index("A") + key = df["C"] + result = df.groupby(key, observed=True).sum() + expected = df.groupby("C", observed=True).sum() + tm.assert_frame_equal(result, expected) + + def test_rename_columns(using_copy_on_write): # Case: renaming columns returns a new dataframe # + afterwards modifying the result From 58b182b0e823e25081b9a6df809d45b203733d03 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 12 Feb 2024 21:33:18 +0100 Subject: [PATCH 099/396] Backport PR #57341 on branch 2.2.x (REGR: assert_series_equal defaulting to check_exact=True for Index) (#57380) Backport PR #57341: REGR: assert_series_equal defaulting to check_exact=True for Index Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_testing/asserters.py | 3 ++- pandas/tests/util/test_assert_series_equal.py | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index eb6e50c1be160..47ca96c6eef44 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :func:`pandas.testing.assert_series_equal` defaulting to ``check_exact=True`` when checking the :class:`Index` (:issue:`57067`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 3de982498e996..41d2a7344a4ed 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -902,6 +902,7 @@ def assert_series_equal( >>> tm.assert_series_equal(a, b) """ __tracebackhide__ = True + check_exact_index = False if check_exact is lib.no_default else 
check_exact if ( check_exact is lib.no_default and rtol is lib.no_default @@ -944,7 +945,7 @@ def assert_series_equal( right.index, exact=check_index_type, check_names=check_names, - check_exact=check_exact, + check_exact=check_exact_index, check_categorical=check_categorical, check_order=not check_like, rtol=rtol, diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 784a0347cf92b..1878e7d838064 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -474,3 +474,11 @@ def test_assert_series_equal_int_tol(): tm.assert_extension_array_equal( left.astype("Int64").values, right.astype("Int64").values, rtol=1.5 ) + + +def test_assert_series_equal_index_exact_default(): + # GH#57067 + ser1 = Series(np.zeros(6, dtype=int), [0, 0.2, 0.4, 0.6, 0.8, 1]) + ser2 = Series(np.zeros(6, dtype=int), np.linspace(0, 1, 6)) + tm.assert_series_equal(ser1, ser2) + tm.assert_frame_equal(ser1.to_frame(), ser2.to_frame()) From 169bb9cf8ca6ee7adbc9e1328e9fff378793ecd0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 12 Feb 2024 21:33:27 +0100 Subject: [PATCH 100/396] Backport PR #57340 on branch 2.2.x (REGR: shift raising for axis=1 and empty df) (#57381) Backport PR #57340: REGR: shift raising for axis=1 and empty df Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/frame.py | 3 +++ pandas/tests/frame/methods/test_shift.py | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 47ca96c6eef44..94e26ff6aa46a 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -25,6 +25,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) +- Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for an index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 734756cb8f7c8..b531ddc418df1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5859,6 +5859,9 @@ def shift( ) fill_value = lib.no_default + if self.empty: + return self.copy() + axis = self._get_axis_number(axis) if is_list_like(periods): diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index b21aa2d687682..abb30595fdcb8 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -756,3 +756,9 @@ def test_shift_with_iterable_check_other_arguments(self): msg = "Cannot specify `suffix` if `periods` is an int."
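# Illustrative recap (a minimal sketch, not part of the diff): the new
# ``if self.empty: return self.copy()`` guard added to DataFrame.shift above
# means the following no longer hits an internal AssertionError on 2.2.x:
#
#     import pandas as pd
#
#     df = pd.DataFrame()
#     result = df.shift(1, axis=1)  # returns an empty copy instead of raising
#     assert result.empty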
with pytest.raises(ValueError, match=msg): df.shift(1, suffix="fails") + + def test_shift_axis_one_empty(self): + # GH#57301 + df = DataFrame() + result = df.shift(1, axis=1) + tm.assert_frame_equal(result, df) From 09debec463762f209787534dc335b3bb7cd3961f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 12 Feb 2024 22:50:47 +0100 Subject: [PATCH 101/396] Backport PR #57379 on branch 2.2.x (Fix numpy-dev CI warnings) (#57383) Backport PR #57379: Fix numpy-dev CI warnings Co-authored-by: William Ayd --- .../src/vendored/ujson/python/objToJSON.c | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 8bba95dd456de..74ca8ead3d936 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -447,8 +447,15 @@ static void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { npyarr->curdim--; npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim]; npyarr->stridedim -= npyarr->inc; - npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArrayPassThru_iterEnd received a non-array object"); + return; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; + npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim); npyarr->dataptr += npyarr->stride; NpyArr_freeItemValue(obj, tc); @@ -467,12 +474,19 @@ static int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { NpyArr_freeItemValue(obj, tc); - if (PyArray_ISDATETIME(npyarr->array)) { + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArr_iterNextItem received a non-array object"); + return 0; + } + PyArrayObject *arrayobj = (PyArrayObject *)npyarr->array; + + if (PyArray_ISDATETIME(arrayobj)) { GET_TC(tc)->itemValue = obj; Py_INCREF(obj); - ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array); + ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(arrayobj); // Also write the resolution (unit) of the ndarray - PyArray_Descr *dtype = PyArray_DESCR(npyarr->array); + PyArray_Descr *dtype = PyArray_DESCR(arrayobj); ((PyObjectEncoder *)tc->encoder)->valueUnit = get_datetime_metadata_from_dtype(dtype).base; ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; @@ -505,8 +519,15 @@ static int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { npyarr->curdim++; npyarr->stridedim += npyarr->inc; - npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArr_iterNext received a non-array object"); + return 0; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; + + npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim); npyarr->index[npyarr->stridedim] = 0; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; @@ -1610,7 +1631,14 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (!values) { goto INVALID; } - pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); + + if (!PyArray_Check(pc->newObj)) { + 
PyErr_SetString(PyExc_TypeError, + "Object_beginTypeContext received a non-array object"); + goto INVALID; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)pc->newObj; + pc->columnLabelsLen = PyArray_DIM(arrayobj, 0); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); if (!pc->columnLabels) { From 3dca4f0f8d583c67d2df56bf9ba51839cdaa6d26 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 14 Feb 2024 03:12:16 +0100 Subject: [PATCH 102/396] Backport PR #57388 on branch 2.2.x (BUG: map(na_action=ignore) not respected for Arrow & masked types) (#57413) Backport PR #57388: BUG: map(na_action=ignore) not respected for Arrow & masked types Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/masked.py | 2 +- pandas/tests/extension/test_arrow.py | 7 +++++++ pandas/tests/extension/test_masked.py | 9 +++++++++ 5 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 94e26ff6aa46a..9733aff0e6eb5 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -24,6 +24,7 @@ Fixed regressions - Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) - Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.map` with ``na_action="ignore"`` not being respected for NumPy nullable and :class:`ArrowDtypes` (:issue:`57316`) - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) - Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 621a32a049f3b..f8b07bd73d315 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1414,7 +1414,7 @@ def to_numpy( def map(self, mapper, na_action=None): if is_numeric_dtype(self.dtype): - return map_array(self.to_numpy(), mapper, na_action=None) + return map_array(self.to_numpy(), mapper, na_action=na_action) else: return super().map(mapper, na_action) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 7c49ce5343158..0bc0d9f8d7a7d 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1333,7 +1333,7 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) def map(self, mapper, na_action=None): - return map_array(self.to_numpy(), mapper, na_action=None) + return map_array(self.to_numpy(), mapper, na_action=na_action) def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index e041093bbf5bc..d9a3033b8380e 100644 --- 
a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3379,3 +3379,10 @@ def test_to_numpy_timestamp_to_int(): result = ser.to_numpy(dtype=np.int64) expected = np.array([1577853000000000000]) tm.assert_numpy_array_equal(result, expected) + + +def test_map_numeric_na_action(): + ser = pd.Series([32, 40, None], dtype="int64[pyarrow]") + result = ser.map(lambda x: 42, na_action="ignore") + expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 3efc561d6a125..651f783b44d1f 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -179,6 +179,15 @@ def test_map(self, data_missing, na_action): expected = data_missing.to_numpy() tm.assert_numpy_array_equal(result, expected) + def test_map_na_action_ignore(self, data_missing_for_sorting): + zero = data_missing_for_sorting[2] + result = data_missing_for_sorting.map(lambda x: zero, na_action="ignore") + if data_missing_for_sorting.dtype.kind == "b": + expected = np.array([False, pd.NA, False], dtype=object) + else: + expected = np.array([zero, np.nan, zero]) + tm.assert_numpy_array_equal(result, expected) + def _get_expected_exception(self, op_name, obj, other): try: dtype = tm.get_dtype(obj) From 11818adf539b701f7dd68ee46d920d05c52fd7ba Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 16 Feb 2024 18:26:32 +0100 Subject: [PATCH 103/396] Backport PR #57450 on branch 2.2.x (DOC: Set verbose parameter as deprecated in docstring) (#57455) Backport PR #57450: DOC: Set verbose parameter as deprecated in docstring Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> --- pandas/io/parsers/readers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index e26e7e7470461..e04f27b560610 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -240,6 +240,8 @@ performance of reading a large file. verbose : bool, default False Indicate number of ``NA`` values placed in non-numeric columns. + + .. deprecated:: 2.2.0 skip_blank_lines : bool, default True If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. 
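# A hedged sketch of the deprecation documented above (assumed 2.2.x runtime
# behavior, not part of the patch): passing ``verbose`` already triggers a
# deprecation warning, so callers should simply drop the argument.
#
#     import io
#     import pandas as pd
#
#     pd.read_csv(io.StringIO("a,b\n1,\n2,3\n"), verbose=True)  # warns on 2.2.x
#
# The NA-count output the flag enabled is slated for removal together with it.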
parse_dates : bool, list of Hashable, list of lists or dict of {{Hashable : list}}, \ From 5550bdb9bebe02ec57c54ca75835bbf12e05ceab Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 16 Feb 2024 20:21:45 +0100 Subject: [PATCH 104/396] Backport PR #57402 on branch 2.2.x (BUG: wrong future Warning on string assignment in certain condition) (#57460) Backport PR #57402: BUG: wrong future Warning on string assignment in certain condition Co-authored-by: Marco Edward Gorelli --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/dtypes/missing.py | 14 ++++++++++++ pandas/core/indexing.py | 25 +++++++++++++-------- pandas/tests/frame/indexing/test_setitem.py | 16 +++++++++++++ 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 9733aff0e6eb5..5e814ec2a1b92 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -24,6 +24,7 @@ Fixed regressions - Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) - Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.loc` which was unnecessarily throwing "incompatible dtype warning" when expanding with partial row indexer and multiple columns (see `PDEP6 `_) (:issue:`56503`) - Fixed regression in :meth:`DataFrame.map` with ``na_action="ignore"`` not being respected for NumPy nullable and :class:`ArrowDtypes` (:issue:`57316`) - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) - Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 4dc0d477f89e8..655a53997620a 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -647,6 +647,20 @@ def infer_fill_value(val): return np.nan +def construct_1d_array_from_inferred_fill_value( + value: object, length: int ) -> ArrayLike: + # Find our empty_value dtype by constructing an array + # from our value and doing a .take on it + from pandas.core.algorithms import take_nd + from pandas.core.construction import sanitize_array + from pandas.core.indexes.base import Index + + arr = sanitize_array(value, Index(range(1)), copy=False) + taker = -1 * np.ones(length, dtype=np.intp) + return take_nd(arr, taker) + + def maybe_fill(arr: np.ndarray) -> np.ndarray: """ Fill numpy.ndarray with NaN, unless we have an integer or boolean dtype.
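For illustration, a minimal sketch (not part of the patch) of what the new helper above computes: an all-missing 1-D array whose dtype is inferred from the fill value, built by sampling a length-1 array with out-of-bounds ``-1`` indices.

import numpy as np

from pandas import Index
from pandas.core.algorithms import take_nd
from pandas.core.construction import sanitize_array

# Same steps as construct_1d_array_from_inferred_fill_value(3.5, 4)
arr = sanitize_array(3.5, Index(range(1)), copy=False)
taker = -1 * np.ones(4, dtype=np.intp)
print(take_nd(arr, taker))  # [nan nan nan nan], float64 inferred from the scalar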
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 934ba3a4d7f29..869e511fc0720 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -57,6 +57,7 @@ ABCSeries, ) from pandas.core.dtypes.missing import ( + construct_1d_array_from_inferred_fill_value, infer_fill_value, is_valid_na_for_dtype, isna, @@ -68,7 +69,6 @@ from pandas.core.construction import ( array as pd_array, extract_array, - sanitize_array, ) from pandas.core.indexers import ( check_array_indexer, @@ -844,7 +844,6 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: if self.ndim != 2: return - orig_key = key if isinstance(key, tuple) and len(key) > 1: # key may be a tuple if we are .loc # if length of key is > 1 set key to column part @@ -862,7 +861,7 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: keys = self.obj.columns.union(key, sort=False) diff = Index(key).difference(self.obj.columns, sort=False) - if len(diff) and com.is_null_slice(orig_key[0]): + if len(diff): # e.g. if we are doing df.loc[:, ["A", "B"]] = 7 and "B" # is a new column, add the new columns with dtype=np.void # so that later when we go through setitem_single_column @@ -1878,12 +1877,9 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): self.obj[key] = empty_value elif not is_list_like(value): - # Find our empty_value dtype by constructing an array - # from our value and doing a .take on it - arr = sanitize_array(value, Index(range(1)), copy=False) - taker = -1 * np.ones(len(self.obj), dtype=np.intp) - empty_value = algos.take_nd(arr, taker) - self.obj[key] = empty_value + self.obj[key] = construct_1d_array_from_inferred_fill_value( + value, len(self.obj) + ) else: # FIXME: GH#42099#issuecomment-864326014 self.obj[key] = infer_fill_value(value) @@ -2165,6 +2161,17 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: else: # set value into the column (first attempting to operate inplace, then # falling back to casting if necessary) + dtype = self.obj.dtypes.iloc[loc] + if dtype == np.void: + # This means we're expanding, with multiple columns, e.g. + # df = pd.DataFrame({'A': [1,2,3], 'B': [4,5,6]}) + # df.loc[df.index <= 2, ['F', 'G']] = (1, 'abc') + # Columns F and G will initially be set to np.void. + # Here, we replace those temporary `np.void` columns with + # columns of the appropriate dtype, based on `value`. 
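# User-visible effect of this branch (a hedged example, not part of the
# diff): expanding multiple new columns from a partial row indexer no longer
# emits the spurious "incompatible dtype" FutureWarning, because the
# temporary np.void placeholder columns are replaced first:
#
#     df = pd.DataFrame({"A": [1, 2, 3], "B": [4.0, 5, 6]})
#     df.loc[df.index <= 1, ["F", "G"]] = (1, "abc")  # no warning on 2.2.1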
+ self.obj.iloc[:, loc] = construct_1d_array_from_inferred_fill_value( + value, len(self.obj) + ) self.obj._mgr.column_setitem(loc, plane_indexer, value) self.obj._clear_item_cache() diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 99233d3cd4cf3..a58dd701f0f22 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1401,3 +1401,19 @@ def test_full_setter_loc_incompatible_dtype(): df.loc[:, "a"] = {0: 3, 1: 4} expected = DataFrame({"a": [3, 4]}) tm.assert_frame_equal(df, expected) + + +def test_setitem_partial_row_multiple_columns(): + # https://github.com/pandas-dev/pandas/issues/56503 + df = DataFrame({"A": [1, 2, 3], "B": [4.0, 5, 6]}) + # should not warn + df.loc[df.index <= 1, ["F", "G"]] = (1, "abc") + expected = DataFrame( + { + "A": [1, 2, 3], + "B": [4.0, 5, 6], + "F": [1.0, 1, float("nan")], + "G": ["abc", "abc", float("nan")], + } + ) + tm.assert_frame_equal(df, expected) From 32d2b9912fc6292778f3d0ba67b928dbcf9e401d Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 17 Feb 2024 23:33:12 +0100 Subject: [PATCH 105/396] Backport PR #57311 on branch 2.2.x (Fixing multi method for to_sql for non-oracle databases) (#57466) Backport PR #57311: Fixing multi method for to_sql for non-oracle databases Co-authored-by: Samuel Chai <121340503+kassett@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/generic.py | 3 +++ pandas/io/sql.py | 11 ++++------- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 5e814ec2a1b92..ca4fef4f57fb6 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -31,6 +31,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for an index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) +- Fixed regression in :meth:`DataFrame.to_sql` when ``method="multi"`` is passed and the dialect type is not Oracle (:issue:`57310`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 55693d4cdb753..62dd0c3bf0161 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2969,6 +2969,9 @@ def to_sql( database. Otherwise, the datetimes will be stored as timezone unaware timestamps local to the original timezone. + + Not all datastores support ``method="multi"``. Oracle, for example, + does not support multi-value insert. + References ---------- ..
[1] https://docs.sqlalchemy.org diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 3a58daf681cfb..195a7c5040853 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1012,22 +1012,19 @@ def _execute_insert(self, conn, keys: list[str], data_iter) -> int: def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int: """ - Alternative to _execute_insert for DBs support multivalue INSERT. + Alternative to _execute_insert for DBs that support multi-value INSERT. Note: multi-value insert is usually faster for analytics DBs and tables containing a few columns but performance degrades quickly with increase of columns. + """ from sqlalchemy import insert data = [dict(zip(keys, row)) for row in data_iter] - stmt = insert(self.table) - # conn.execute is used here to ensure compatibility with Oracle. - # Using stmt.values(data) would produce a multi row insert that - # isn't supported by Oracle. - # see: https://docs.sqlalchemy.org/en/20/core/dml.html#sqlalchemy.sql.expression.Insert.values - result = conn.execute(stmt, data) + stmt = insert(self.table).values(data) + result = conn.execute(stmt) return result.rowcount def insert_data(self) -> tuple[list[str], list[np.ndarray]]: From ab8541ce51b70a8be728e8a15d78f7a5f82b62de Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 18 Feb 2024 13:35:55 +0100 Subject: [PATCH 106/396] Backport PR #57454 on branch 2.2.x (Release the gil in take for axis=1) (#57484) Backport PR #57454: Release the gil in take for axis=1 Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/_libs/algos_take_helper.pxi.in | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 88c3abba506a3..385727fad3c50 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -184,6 +184,17 @@ def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, fv = fill_value + {{if c_type_in == c_type_out != "object"}} + with nogil: + for i in range(n): + for j in range(k): + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + {{else}} for i in range(n): for j in range(k): idx = indexer[j] @@ -195,6 +206,7 @@ def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{else}} out[i, j] = values[i, idx] {{endif}} + {{endif}} @cython.wraparound(False) From b79fe7e19db5931a4f6165e48b04a514d235e1a8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sun, 18 Feb 2024 10:13:56 -0500 Subject: [PATCH 107/396] REGR: DataFrame.update emits spurious warning about downcasting (#57485) --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/frame.py | 13 ++++++++++++- pandas/tests/frame/methods/test_update.py | 12 +++++++----- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index ca4fef4f57fb6..8c8db76ccc3d0 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -32,6 +32,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) - Fixed regression in :meth:`DataFrame.to_sql` when ``method="multi"`` is passed
and the dialect type is not Oracle (:issue:`57310`) +- Fixed regression in :meth:`DataFrame.update` emitting incorrect warnings about downcasting (:issue:`57124`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b531ddc418df1..c09989fe87fb0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8962,6 +8962,7 @@ def update( 1 2 500.0 2 3 6.0 """ + if not PYPY and using_copy_on_write(): if sys.getrefcount(self) <= REF_COUNT: warnings.warn( @@ -9010,7 +9011,17 @@ def update( if mask.all(): continue - self.loc[:, col] = self[col].where(mask, that) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="Downcasting behavior", + category=FutureWarning, + ) + # GH#57124 - `that` might get upcasted because of NA values, and then + # downcasted in where because of the mask. Ignoring the warning + # is a stopgap, will replace with a new implementation of update + # in 3.0. + self.loc[:, col] = self[col].where(mask, that) # ---------------------------------------------------------------------- # Data reshaping diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 20ba550beeb30..8af1798aa8e00 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -48,16 +48,18 @@ def test_update(self): def test_update_dtypes(self): # gh 3016 df = DataFrame( - [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], - columns=["A", "B", "bool1", "bool2"], + [[1.0, 2.0, 1, False, True], [4.0, 5.0, 2, True, False]], + columns=["A", "B", "int", "bool1", "bool2"], ) - other = DataFrame([[45, 45]], index=[0], columns=["A", "B"]) + other = DataFrame( + [[45, 45, 3, True]], index=[0], columns=["A", "B", "int", "bool1"] + ) df.update(other) expected = DataFrame( - [[45.0, 45.0, False, True], [4.0, 5.0, True, False]], - columns=["A", "B", "bool1", "bool2"], + [[45.0, 45.0, 3, True, True], [4.0, 5.0, 2, True, False]], + columns=["A", "B", "int", "bool1", "bool2"], ) tm.assert_frame_equal(df, expected) From dfc66f6e5da81f8e7cfc8ff5b4e557ac34f80a47 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 19 Feb 2024 04:51:13 -0500 Subject: [PATCH 108/396] Backport PR #57474 on branch 2.2.x (REGR: DataFrame.transpose resulting in not contiguous data on nullable EAs) (#57496) Backport PR #57474: REGR: DataFrame.transpose resulting in not contiguous data on nullable EAs --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/arrays/masked.py | 17 ++++++++++++++--- pandas/tests/frame/methods/test_transpose.py | 17 +++++++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 8c8db76ccc3d0..e1da0af59b3b8 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -32,6 +32,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and 
datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) - Fixed regression in :meth:`DataFrame.to_sql` when ``method="multi"`` is passed and the dialect type is not Oracle (:issue:`57310`) +- Fixed regression in :meth:`DataFrame.transpose` with nullable extension dtypes not having F-contiguous data potentially causing exceptions when used (:issue:`57315`) - Fixed regression in :meth:`DataFrame.update` emitting incorrect warnings about downcasting (:issue:`57124`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 0bc0d9f8d7a7d..320d8cb10b8c2 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1621,13 +1621,24 @@ def transpose_homogeneous_masked_arrays( same dtype. The caller is responsible for ensuring validity of input data. """ masked_arrays = list(masked_arrays) + dtype = masked_arrays[0].dtype + values = [arr._data.reshape(1, -1) for arr in masked_arrays] - transposed_values = np.concatenate(values, axis=0) + transposed_values = np.concatenate( + values, + axis=0, + out=np.empty( + (len(masked_arrays), len(masked_arrays[0])), + order="F", + dtype=dtype.numpy_dtype, + ), + ) masks = [arr._mask.reshape(1, -1) for arr in masked_arrays] - transposed_masks = np.concatenate(masks, axis=0) + transposed_masks = np.concatenate( + masks, axis=0, out=np.empty_like(transposed_values, dtype=bool) + ) - dtype = masked_arrays[0].dtype arr_type = dtype.construct_array_type() transposed_arrays: list[BaseMaskedArray] = [] for i in range(transposed_values.shape[1]): diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index d0caa071fae1c..3e74094f266d1 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -3,6 +3,7 @@ import pandas.util._test_decorators as td +import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -190,3 +191,19 @@ def test_transpose_not_inferring_dt_mixed_blocks(self): dtype=object, ) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype1", ["Int64", "Float64"]) + @pytest.mark.parametrize("dtype2", ["Int64", "Float64"]) + def test_transpose(self, dtype1, dtype2): + # GH#57315 - transpose should have F contiguous blocks + df = DataFrame( + { + "a": pd.array([1, 1, 2], dtype=dtype1), + "b": pd.array([3, 4, 5], dtype=dtype2), + } + ) + result = df.T + for blk in result._mgr.blocks: + # When dtypes are unequal, we get NumPy object array + data = blk.values._data if dtype1 == dtype2 else blk.values + assert data.flags["F_CONTIGUOUS"] From 25df68eb19c7c783afe276762f3c0a3f03ac8706 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 19 Feb 2024 21:15:56 +0100 Subject: [PATCH 109/396] Backport PR #57486 on branch 2.2.x (CI: Run excel tests on single cpu for windows) (#57507) Backport PR #57486: CI: Run excel tests on single cpu for windows Co-authored-by: Patrick Hoefler 
<61934744+phofl@users.noreply.github.com> --- pandas/tests/io/excel/test_odf.py | 5 +++++ pandas/tests/io/excel/test_odswriter.py | 5 +++++ pandas/tests/io/excel/test_openpyxl.py | 5 +++++ pandas/tests/io/excel/test_readers.py | 4 ++++ pandas/tests/io/excel/test_style.py | 4 ++++ pandas/tests/io/excel/test_writers.py | 4 ++++ pandas/tests/io/excel/test_xlrd.py | 5 +++++ pandas/tests/io/excel/test_xlsxwriter.py | 5 +++++ 8 files changed, 37 insertions(+) diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index f01827fa4ca2f..b5bb9b27258d8 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -3,11 +3,16 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows + import pandas as pd import pandas._testing as tm pytest.importorskip("odf") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture(autouse=True) def cd_and_set_engine(monkeypatch, datapath): diff --git a/pandas/tests/io/excel/test_odswriter.py b/pandas/tests/io/excel/test_odswriter.py index 271353a173d2a..1c728ad801bc1 100644 --- a/pandas/tests/io/excel/test_odswriter.py +++ b/pandas/tests/io/excel/test_odswriter.py @@ -6,6 +6,8 @@ import pytest +from pandas.compat import is_platform_windows + import pandas as pd import pandas._testing as tm @@ -13,6 +15,9 @@ odf = pytest.importorskip("odf") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture def ext(): diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 2df9ec9e53516..e53b5830ec6a4 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows + import pandas as pd from pandas import DataFrame import pandas._testing as tm @@ -17,6 +19,9 @@ openpyxl = pytest.importorskip("openpyxl") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture def ext(): diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 15712f36da4ca..8da8535952dcf 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -18,6 +18,7 @@ from pandas._config import using_pyarrow_string_dtype +from pandas.compat import is_platform_windows import pandas.util._test_decorators as td import pandas as pd @@ -34,6 +35,9 @@ StringArray, ) +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] engine_params = [ # Add any engines to test here diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index 3ca8637885639..89615172688d7 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows import pandas.util._test_decorators as td from pandas import ( @@ -20,6 +21,9 @@ # could compute styles and render to excel without jinja2, since there is no # 'template' file, but this needs the import error to delayed until render time. 
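# Recap of the pattern these excel test modules now share (illustrative, not
# part of the diff): a module-level ``pytestmark`` applies its mark to every
# test in the file,
#
#     if is_platform_windows():
#         pytestmark = pytest.mark.single_cpu
#
# so the marked modules can be routed to a non-parallel job, e.g. selected
# with ``pytest -m single_cpu`` (the exact CI wiring is an assumption here).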
+if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + def assert_equal_cell_styles(cell1, cell2): # TODO: should find a better way to check equality diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 8c003723c1c71..292eab2d88152 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -11,6 +11,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows from pandas.compat._constants import PY310 from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -34,6 +35,9 @@ ) from pandas.io.excel._util import _writers +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + def get_exp_unit(path: str) -> str: return "ns" diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 6d5008ca9ee68..066393d91eead 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows + import pandas as pd import pandas._testing as tm @@ -11,6 +13,9 @@ xlrd = pytest.importorskip("xlrd") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture(params=[".xls"]) def read_ext_xlrd(request): diff --git a/pandas/tests/io/excel/test_xlsxwriter.py b/pandas/tests/io/excel/test_xlsxwriter.py index 94f6bdfaf069c..529367761fc02 100644 --- a/pandas/tests/io/excel/test_xlsxwriter.py +++ b/pandas/tests/io/excel/test_xlsxwriter.py @@ -2,6 +2,8 @@ import pytest +from pandas.compat import is_platform_windows + from pandas import DataFrame import pandas._testing as tm @@ -9,6 +11,9 @@ xlsxwriter = pytest.importorskip("xlsxwriter") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture def ext(): From 6766c92fbac05118374c3963ac4b59524d80cf82 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 19 Feb 2024 23:12:25 +0100 Subject: [PATCH 110/396] Backport PR #57490 on branch 2.2.x (DOC: Add a few deprecation notes) (#57509) Backport PR #57490: DOC: Add a few deprecation notes Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/dtypes/common.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 2245359fd8eac..df0251d141984 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -169,6 +169,9 @@ def is_sparse(arr) -> bool: """ Check whether an array-like is a 1-D pandas sparse array. + .. deprecated:: 2.1.0 + Use isinstance(dtype, pd.SparseDtype) instead. + Check that the one-dimensional array-like is a pandas sparse array. Returns True if it is a pandas sparse array, not another type of sparse array. @@ -295,6 +298,9 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of a DatetimeTZDtype dtype. + .. deprecated:: 2.1.0 + Use isinstance(dtype, pd.DatetimeTZDtype) instead. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -381,6 +387,9 @@ def is_period_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Period dtype. + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.PeriodDtype) instead.
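# Migration sketch for the deprecation notes being added here (illustrative):
# the old predicates accept dtypes or array-likes, while the replacement is a
# plain isinstance check on the dtype object.
#
#     import pandas as pd
#
#     dtype = pd.PeriodDtype("M")
#     isinstance(dtype, pd.PeriodDtype)           # instead of is_period_dtype(dtype)
#     isinstance(ser.dtype, pd.CategoricalDtype)  # for a Series ``ser``, check .dtype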
+ Parameters ---------- arr_or_dtype : array-like or dtype @@ -424,6 +433,9 @@ def is_interval_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Interval dtype. + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.IntervalDtype) instead. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -470,6 +482,9 @@ def is_categorical_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Categorical dtype. + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.CategoricalDtype) instead. + Parameters ---------- arr_or_dtype : array-like or dtype From 2ae7a1057a1c64b4cddf94aa5fa439f5a14ea5d1 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 20 Feb 2024 02:20:20 +0100 Subject: [PATCH 111/396] Backport PR #57488 on branch 2.2.x (REGR: query raising for all NaT in object column) (#57515) Backport PR #57488: REGR: query raising for all NaT in object column Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/generic.py | 2 +- pandas/tests/frame/test_query_eval.py | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index e1da0af59b3b8..6c6a37b2b2f89 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -27,6 +27,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.loc` which was unnecessarily throwing "incompatible dtype warning" when expanding with partial row indexer and multiple columns (see `PDEP6 `_) (:issue:`56503`) - Fixed regression in :meth:`DataFrame.map` with ``na_action="ignore"`` not being respected for NumPy nullable and :class:`ArrowDtypes` (:issue:`57316`) - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) +- Fixed regression in :meth:`DataFrame.query` with all ``NaT`` column with object dtype (:issue:`57068`) - Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 62dd0c3bf0161..0ec17173287f5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -657,7 +657,7 @@ def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]: return { clean_column_name(k): Series( - v, copy=False, index=self.index, name=k + v, copy=False, index=self.index, name=k, dtype=self.dtypes[k] ).__finalize__(self) for k, v in zip(self.columns, self._iter_column_arrays()) if not isinstance(k, int) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index a498296e09c52..2c807c72582c5 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1415,3 +1415,11 @@ def test_query_ea_equality_comparison(self, dtype, engine): } ) tm.assert_frame_equal(result, expected) + + def test_all_nat_in_object(self): + # GH#57068 + now = pd.Timestamp.now("UTC") # noqa: F841 + df = DataFrame({"a": pd.to_datetime([None, None], utc=True)}, dtype=object) + result = df.query("a > @now") + expected = DataFrame({"a": []}, dtype=object) + 
tm.assert_frame_equal(result, expected) From 9b1ce062d3985e7556fc7fa6f9d4c6bc184576eb Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 20 Feb 2024 19:51:27 +0100 Subject: [PATCH 112/396] Backport PR #57489 on branch 2.2.x (REGR: astype introducing decimals when casting from int with na to string) (#57533) Backport PR #57489: REGR: astype introducing decimals when casting from int with na to string Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/lib.pyx | 2 +- pandas/tests/series/methods/test_astype.py | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 6c6a37b2b2f89..85b9346aa01a5 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -39,6 +39,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) +- Fixed regression in :meth:`Series.astype` introducing decimals when converting from integer with missing values to string dtype (:issue:`57418`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) - Fixed regression in :meth:`Series.to_numpy` when dtype is given as float and the data contains NaNs (:issue:`57121`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c483f35513a40..7656e8d986117 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -759,7 +759,7 @@ cpdef ndarray[object] ensure_string_array( out = arr.astype(str).astype(object) out[arr.isna()] = na_value return out - arr = arr.to_numpy() + arr = arr.to_numpy(dtype=object) elif not util.is_array(arr): arr = np.array(arr, dtype="object") diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 46f55fff91e41..4c8028e74ee55 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -673,3 +673,11 @@ def test_astype_timedelta64_with_np_nan(self): result = Series([Timedelta(1), np.nan], dtype="timedelta64[ns]") expected = Series([Timedelta(1), NaT], dtype="timedelta64[ns]") tm.assert_series_equal(result, expected) + + @td.skip_if_no("pyarrow") + def test_astype_int_na_string(self): + # GH#57418 + ser = Series([12, NA], dtype="Int64[pyarrow]") + result = ser.astype("string[pyarrow]") + expected = Series(["12", NA], dtype="string[pyarrow]") + tm.assert_series_equal(result, expected) From 0b49cf35bdc025e69ef104d59660cb08b973fec5 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 20 Feb 2024 22:32:29 +0100 Subject: [PATCH 113/396] Backport PR #57536 on branch 2.2.x (BUG: dt64 + DateOffset with milliseconds) (#57537) Backport PR #57536: BUG: dt64 + DateOffset with milliseconds Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/tslibs/offsets.pyx | 15 
+++++++++- pandas/tests/arithmetic/test_datetime64.py | 32 ++++++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 85b9346aa01a5..ab62a7c7598d5 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -42,6 +42,7 @@ Fixed regressions - Fixed regression in :meth:`Series.astype` introducing decimals when converting from integer with missing values to string dtype (:issue:`57418`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) - Fixed regression in :meth:`Series.to_numpy` when dtype is given as float and the data contains NaNs (:issue:`57121`) +- Fixed regression in addition or subtraction of :class:`DateOffset` objects with millisecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` (:issue:`57529`) .. --------------------------------------------------------------------------- .. _whatsnew_221.bug_fixes: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 70e1ca1c4012a..c37a4b285daef 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1458,13 +1458,22 @@ cdef class RelativeDeltaOffset(BaseOffset): "minutes", "seconds", "microseconds", + "milliseconds", } # relativedelta/_offset path only valid for base DateOffset if self._use_relativedelta and set(kwds).issubset(relativedelta_fast): + td_args = { + "days", + "hours", + "minutes", + "seconds", + "microseconds", + "milliseconds" + } td_kwds = { key: val for key, val in kwds.items() - if key in ["days", "hours", "minutes", "seconds", "microseconds"] + if key in td_args } if "weeks" in kwds: days = td_kwds.get("days", 0) @@ -1474,6 +1483,8 @@ cdef class RelativeDeltaOffset(BaseOffset): delta = Timedelta(**td_kwds) if "microseconds" in kwds: delta = delta.as_unit("us") + elif "milliseconds" in kwds: + delta = delta.as_unit("ms") else: delta = delta.as_unit("s") else: @@ -1491,6 +1502,8 @@ cdef class RelativeDeltaOffset(BaseOffset): delta = Timedelta(self._offset * self.n) if "microseconds" in kwds: delta = delta.as_unit("us") + elif "milliseconds" in kwds: + delta = delta.as_unit("ms") else: delta = delta.as_unit("s") return delta diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index dbff88dc6f4f6..a468449efd507 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1586,6 +1586,38 @@ def test_dti_add_sub_nonzero_mth_offset( expected = tm.box_expected(expected, box_with_array, False) tm.assert_equal(result, expected) + def test_dt64arr_series_add_DateOffset_with_milli(self): + # GH 57529 + dti = DatetimeIndex( + [ + "2000-01-01 00:00:00.012345678", + "2000-01-31 00:00:00.012345678", + "2000-02-29 00:00:00.012345678", + ], + dtype="datetime64[ns]", + ) + result = dti + DateOffset(milliseconds=4) + expected = DatetimeIndex( + [ + "2000-01-01 00:00:00.016345678", + "2000-01-31 00:00:00.016345678", + "2000-02-29 00:00:00.016345678", + ], + dtype="datetime64[ns]", + ) + tm.assert_index_equal(result, expected) + + result = dti + DateOffset(days=1, milliseconds=4) + expected = DatetimeIndex( + [ + "2000-01-02 00:00:00.016345678", + "2000-02-01 00:00:00.016345678", + "2000-03-01 00:00:00.016345678", + ], + dtype="datetime64[ns]", + ) + tm.assert_index_equal(result, expected) + class TestDatetime64OverflowHandling: # TODO: box + de-duplicate From 
c101d3022b0a6b3a2375db64111c3f6fe516acf3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 20 Feb 2024 22:32:46 +0100 Subject: [PATCH 114/396] Backport PR #57510 on branch 2.2.x (DOC: Fix xarray example) (#57538) DOC: Fix xarray example (#57510) * DOC: Fix xarray example * Update * Skip doctest * Igore another doctest --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> (cherry picked from commit 33a1cd7163ce712e5a38fdb5d2e04de203b3ddf9) --- pandas/core/generic.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0ec17173287f5..1dd471a09f1b1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3278,18 +3278,18 @@ def to_xarray(self): 2 lion mammal 80.5 4 3 monkey mammal NaN 4 - >>> df.to_xarray() + >>> df.to_xarray() # doctest: +SKIP Dimensions: (index: 4) Coordinates: - * index (index) int64 0 1 2 3 + * index (index) int64 32B 0 1 2 3 Data variables: - name (index) object 'falcon' 'parrot' 'lion' 'monkey' - class (index) object 'bird' 'bird' 'mammal' 'mammal' - max_speed (index) float64 389.0 24.0 80.5 nan - num_legs (index) int64 2 2 4 4 + name (index) object 32B 'falcon' 'parrot' 'lion' 'monkey' + class (index) object 32B 'bird' 'bird' 'mammal' 'mammal' + max_speed (index) float64 32B 389.0 24.0 80.5 nan + num_legs (index) int64 32B 2 2 4 4 - >>> df['max_speed'].to_xarray() + >>> df['max_speed'].to_xarray() # doctest: +SKIP array([389. , 24. , 80.5, nan]) Coordinates: @@ -3311,7 +3311,7 @@ class (index) object 'bird' 'bird' 'mammal' 'mammal' 2018-01-02 falcon 361 parrot 15 - >>> df_multiindex.to_xarray() + >>> df_multiindex.to_xarray() # doctest: +SKIP Dimensions: (date: 2, animal: 2) Coordinates: From 3a4033c7296a665fcbbbcd49a05a420611fa3f3c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 21 Feb 2024 21:46:10 +0100 Subject: [PATCH 115/396] Backport PR #57439 on branch 2.2.x (BUG: read_json returning Index instead of RangeIndex) (#57552) Backport PR #57439: BUG: read_json returning Index instead of RangeIndex Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/io/json/_json.py | 25 ++++++++++++++----------- pandas/tests/io/json/test_pandas.py | 16 ++++++++++++++-- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index ab62a7c7598d5..d81032fafa730 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`pandas.testing.assert_series_equal` defaulting to ``check_exact=True`` when checking the :class:`Index` (:issue:`57067`) +- Fixed regression in :func:`read_json` where an :class:`Index` would be returned instead of a :class:`RangeIndex` (:issue:`57429`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, 
:meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 4c490c6b2cda2..9414f45215029 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1266,6 +1266,7 @@ def _try_convert_data( if result: return new_data, True + converted = False if self.dtype_backend is not lib.no_default and not is_axis: # Fall through for conversion later on return data, True @@ -1273,16 +1274,17 @@ def _try_convert_data( # try float try: data = data.astype("float64") + converted = True except (TypeError, ValueError): pass - if data.dtype.kind == "f": - if data.dtype != "float64": - # coerce floats to 64 - try: - data = data.astype("float64") - except (TypeError, ValueError): - pass + if data.dtype.kind == "f" and data.dtype != "float64": + # coerce floats to 64 + try: + data = data.astype("float64") + converted = True + except (TypeError, ValueError): + pass # don't coerce 0-len data if len(data) and data.dtype in ("float", "object"): @@ -1291,14 +1293,15 @@ def _try_convert_data( new_data = data.astype("int64") if (new_data == data).all(): data = new_data + converted = True except (TypeError, ValueError, OverflowError): pass - # coerce ints to 64 - if data.dtype == "int": - # coerce floats to 64 + if data.dtype == "int" and data.dtype != "int64": + # coerce ints to 64 try: data = data.astype("int64") + converted = True except (TypeError, ValueError): pass @@ -1307,7 +1310,7 @@ def _try_convert_data( if self.orient == "split": return data, False - return data, True + return data, converted @final def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index caa25841d3596..5279f3f1cdfbe 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -24,6 +24,7 @@ DataFrame, DatetimeIndex, Index, + RangeIndex, Series, Timestamp, date_range, @@ -493,12 +494,12 @@ def test_frame_mixedtype_orient(self): # GH10289 left = read_json(inp, orient=orient, convert_axes=False) tm.assert_frame_equal(left, right) - right.index = pd.RangeIndex(len(df)) + right.index = RangeIndex(len(df)) inp = StringIO(df.to_json(orient="records")) left = read_json(inp, orient="records", convert_axes=False) tm.assert_frame_equal(left, right) - right.columns = pd.RangeIndex(df.shape[1]) + right.columns = RangeIndex(df.shape[1]) inp = StringIO(df.to_json(orient="values")) left = read_json(inp, orient="values", convert_axes=False) tm.assert_frame_equal(left, right) @@ -2188,3 +2189,14 @@ def test_to_json_ea_null(): {"a":null,"b":null} """ assert result == expected + + +def test_read_json_lines_rangeindex(): + # GH 57429 + data = """ +{"a": 1, "b": 2} +{"a": 3, "b": 4} +""" + result = read_json(StringIO(data), lines=True).index + expected = RangeIndex(2) + tm.assert_index_equal(result, expected, exact=True) From 3bfedfef7401f6af7e551b9fd2182442b70f0409 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 21 Feb 2024 13:49:28 -1000 Subject: [PATCH 116/396] Backport PR #57551: BLD: Add pyarrow extra for pip installation (#57557) --- .github/workflows/package-checks.yml | 2 +- 
doc/source/whatsnew/v2.2.1.rst | 6 ++++++ pyproject.toml | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index d59ddf272f705..7c1da5678a2aa 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"] + extra: ["test", "pyarrow", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"] fail-fast: false name: Install Extras - ${{ matrix.extra }} concurrency: diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index d81032fafa730..e381f9df16383 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -9,6 +9,12 @@ including other versions of pandas. {{ header }} .. --------------------------------------------------------------------------- +.. _whatsnew_221.enhancements: + +Enhancements +~~~~~~~~~~~~ +- Added ``pyarrow`` pip extra so users can install pandas and pyarrow with pip with ``pip install pandas[pyarrow]`` (:issue:`54466`) + .. _whatsnew_221.regressions: Fixed regressions diff --git a/pyproject.toml b/pyproject.toml index 8c9a79aa2b059..c225ed80dcb10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] test = ['hypothesis>=6.46.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0'] +pyarrow = ['pyarrow>=10.0.1'] performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4'] computation = ['scipy>=1.10.0', 'xarray>=2022.12.0'] fss = ['fsspec>=2022.11.0'] From ea56e0cd720b8a4e1c26df7387e8ccb4badf6eb2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 22 Feb 2024 19:45:36 +0100 Subject: [PATCH 117/396] Backport PR #57556 on branch 2.2.x (Remove PyArrow deprecation warning) (#57568) --- .github/workflows/unit-tests.yml | 5 +---- doc/source/whatsnew/v2.2.1.rst | 10 ++++++++++ pandas/__init__.py | 31 +------------------------------ pandas/compat/pyarrow.py | 2 -- pandas/tests/test_common.py | 25 ------------------------- 5 files changed, 12 insertions(+), 61 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 4c7aa1e1e49ee..8736674bbf965 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -92,10 +92,7 @@ jobs: - name: "Numpy Dev" env_file: actions-311-numpydev.yaml pattern: "not slow and not network and not single_cpu" - # Currently restricted the warnings that error to Deprecation Warnings from numpy - # done since pyarrow isn't compatible with numpydev always - # TODO: work with pyarrow to revert this? 
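The pyarrow extra introduced above is ordinary packaging metadata, so its presence can be verified without importing pandas at all. A minimal sketch, assuming an environment where a pandas build containing the new pyproject.toml entry is installed:

    from importlib.metadata import metadata

    # Provides-Extra entries come straight from [project.optional-dependencies]
    extras = metadata("pandas").get_all("Provides-Extra") or []
    print("pyarrow" in extras)  # True once the extra from pyproject.toml ships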
- test_args: "-W error::DeprecationWarning:numpy -W error::FutureWarning:numpy" + test_args: "-W error::DeprecationWarning -W error::FutureWarning" - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index e381f9df16383..35d64c99f2002 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -67,6 +67,16 @@ Bug fixes Other ~~~~~ + +.. note:: + + The ``DeprecationWarning`` that was raised when pandas was imported without PyArrow being + installed has been removed. This decision was made because the warning was too noisy for too + many users and a lot of feedback was collected about the decision to make PyArrow a required + dependency. Pandas is currently considering the decision whether or not PyArrow should be added + as a hard dependency in 3.0. Interested users can follow the discussion + `here `_. + - Added the argument ``skipna`` to :meth:`DataFrameGroupBy.first`, :meth:`DataFrameGroupBy.last`, :meth:`SeriesGroupBy.first`, and :meth:`SeriesGroupBy.last`; achieving ``skipna=False`` used to be available via :meth:`DataFrameGroupBy.nth`, but the behavior was changed in pandas 2.0.0 (:issue:`57019`) - Added the argument ``skipna`` to :meth:`Resampler.first`, :meth:`Resampler.last` (:issue:`57019`) diff --git a/pandas/__init__.py b/pandas/__init__.py index ed524c2bb3619..ca2eba2043292 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -203,36 +203,7 @@ stacklevel=2, ) -# DeprecationWarning for missing pyarrow -from pandas.compat.pyarrow import pa_version_under10p1, pa_not_found - -if pa_version_under10p1: - # pyarrow is either too old or nonexistent, warn - from pandas.compat._optional import VERSIONS - - if pa_not_found: - pa_msg = "was not found to be installed on your system." - else: - pa_msg = ( - f"was too old on your system - pyarrow {VERSIONS['pyarrow']} " - "is the current minimum supported version as of this release." 
- ) - - warnings.warn( - f""" -Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0), -(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries) -but {pa_msg} -If this would cause problems for you, -please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 - """, # noqa: E501 - DeprecationWarning, - stacklevel=2, - ) - del VERSIONS, pa_msg - -# Delete all unnecessary imported modules -del pa_version_under10p1, pa_not_found, warnings, os +del warnings, os # module level doc-string __doc__ = """ diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 2e151123ef2c9..beb4814914101 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -8,7 +8,6 @@ import pyarrow as pa _palv = Version(Version(pa.__version__).base_version) - pa_not_found = False pa_version_under10p1 = _palv < Version("10.0.1") pa_version_under11p0 = _palv < Version("11.0.0") pa_version_under12p0 = _palv < Version("12.0.0") @@ -17,7 +16,6 @@ pa_version_under14p1 = _palv < Version("14.0.1") pa_version_under15p0 = _palv < Version("15.0.0") except ImportError: - pa_not_found = True pa_version_under10p1 = True pa_version_under11p0 = True pa_version_under12p0 = True diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 4af71be11fe6b..e8a1c961c8cb6 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -8,8 +8,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import Series import pandas._testing as tm @@ -267,26 +265,3 @@ def test_bz2_missing_import(): code = textwrap.dedent(code) call = [sys.executable, "-c", code] subprocess.check_output(call) - - -@td.skip_if_installed("pyarrow") -@pytest.mark.parametrize("module", ["pandas", "pandas.arrays"]) -def test_pyarrow_missing_warn(module): - # GH56896 - response = subprocess.run( - [sys.executable, "-c", f"import {module}"], - capture_output=True, - check=True, - ) - msg = """ -Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0), -(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries) -but was not found to be installed on your system. 
-If this would cause problems for you, -please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 -""" # noqa: E501 - stderr_msg = response.stderr.decode("utf-8") - # Split by \n to avoid \r\n vs \n differences on Windows/Unix - # https://stackoverflow.com/questions/11989501/replacing-r-n-with-n - stderr_msg = "\n".join(stderr_msg.splitlines()) - assert msg in stderr_msg From 5521dc98f9fe73cc3a0adbbf61a2e15f3ae0736e Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 23 Feb 2024 01:38:10 +0100 Subject: [PATCH 118/396] Backport PR #57314 on branch 2.2.x (BUG: Fix near-minimum timestamp handling) (#57573) Backport PR #57314: BUG: Fix near-minimum timestamp handling Co-authored-by: Robert Schmidtke --- doc/source/whatsnew/v2.2.1.rst | 1 + .../src/vendored/numpy/datetime/np_datetime.c | 18 ++++++++++++++---- pandas/tests/tslibs/test_array_to_datetime.py | 17 +++++++++++++++++ 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 35d64c99f2002..fa5304c1c3b56 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -21,6 +21,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed memory leak in :func:`read_csv` (:issue:`57039`) - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) +- Fixed regression causing overflow for near-minimum timestamps (:issue:`57150`) - Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`pandas.testing.assert_series_equal` defaulting to ``check_exact=True`` when checking the :class:`Index` (:issue:`57067`) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index 06e3251db8315..277d01807f2f3 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -482,10 +482,20 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, if (base == NPY_FR_ns) { int64_t nanoseconds; - PD_CHECK_OVERFLOW( - scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); - PD_CHECK_OVERFLOW( - checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds)); + + // Minimum valid timestamp in nanoseconds (1677-09-21 00:12:43.145224193). + const int64_t min_nanoseconds = NPY_MIN_INT64 + 1; + if (microseconds == min_nanoseconds / 1000 - 1) { + // For values within one microsecond of min_nanoseconds, use it as base + // and offset it with nanosecond delta to avoid overflow during scaling. 
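+    // Editorial illustration, mirroring the tests added further below: for
+    // "1677-09-21T00:12:43.145224193", microseconds equals
+    // min_nanoseconds / 1000 - 1 = -9223372036854776, and scaling it by 1000
+    // would give -9223372036854776000, which is 192 below INT64_MIN, while
+    // adding the 193 ns remainder afterwards lands exactly on the minimum
+    // valid value, -9223372036854775807.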
+ PD_CHECK_OVERFLOW(checked_int64_add( + min_nanoseconds, (dts->ps - _NS_MIN_DTS.ps) / 1000, &nanoseconds)); + } else { + PD_CHECK_OVERFLOW( + scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); + PD_CHECK_OVERFLOW( + checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds)); + } return nanoseconds; } diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 632d3b4cc3c84..82175c67764f8 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -296,6 +296,23 @@ def test_to_datetime_barely_out_of_bounds(): tslib.array_to_datetime(arr) +@pytest.mark.parametrize( + "timestamp", + [ + # Close enough to bounds that scaling micros to nanos overflows + # but adding nanos would result in an in-bounds datetime. + "1677-09-21T00:12:43.145224193", + "1677-09-21T00:12:43.145224999", + # this always worked + "1677-09-21T00:12:43.145225000", + ], +) +def test_to_datetime_barely_inside_bounds(timestamp): + # see gh-57150 + result, _ = tslib.array_to_datetime(np.array([timestamp], dtype=object)) + tm.assert_numpy_array_equal(result, np.array([timestamp], dtype="M8[ns]")) + + class SubDatetime(datetime): pass From bdbb17993cacf916648c1f3e0aa637f459346a4f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 23 Feb 2024 05:47:35 +0100 Subject: [PATCH 119/396] Backport PR #57576 on branch 2.2.x (DOC: Add release date for 2.2.1) (#57579) Backport PR #57576: DOC: Add release date for 2.2.1 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index fa5304c1c3b56..f4ed594613349 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -1,6 +1,6 @@ .. _whatsnew_221: -What's new in 2.2.1 (February XX, 2024) +What's new in 2.2.1 (February 22, 2024) --------------------------------------- These are the changes in pandas 2.2.1. See :ref:`release` for a full changelog From 541448e41cefbff27e45866ad173826fd65e807e Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Fri, 23 Feb 2024 08:36:39 -0500 Subject: [PATCH 120/396] RLS: 2.2.1 From 470b886470f202e578c77952db8163e49f3efeab Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 23 Feb 2024 15:48:32 +0100 Subject: [PATCH 121/396] Backport PR #57582 on branch 2.2.x (DOC: Add contributors for 2.2.1) (#57583) Backport PR #57582: DOC: Add contributors for 2.2.1 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 2 +- doc/source/whatsnew/v2.2.1.rst | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d9ab0452c8334..e015afb17dce5 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -946,4 +946,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v2.1.4..v2.2.0|HEAD +.. contributors:: v2.1.4..v2.2.0 diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index f4ed594613349..310dd921e44f6 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -86,3 +86,5 @@ Other Contributors ~~~~~~~~~~~~ + +.. 
contributors:: v2.2.0..v2.2.1|HEAD From bdc79c146c2e32f2cab629be240f01658cfb6cc2 Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Fri, 23 Feb 2024 09:49:27 -0500 Subject: [PATCH 122/396] RLS: 2.2.1 From 9a0718468d67608f25dd56c3d912e43950da154a Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 1 Mar 2024 15:36:48 +0000 Subject: [PATCH 123/396] Backport PR #57689 on branch 2.2.x (CI: fix ci (calamine typing)) (#57692) Backport PR #57689: CI: fix ci (calamine typing) --- pandas/io/excel/_calamine.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/io/excel/_calamine.py b/pandas/io/excel/_calamine.py index 4f65acf1aa40e..5259469f7a569 100644 --- a/pandas/io/excel/_calamine.py +++ b/pandas/io/excel/_calamine.py @@ -74,9 +74,7 @@ def load_workbook( ) -> CalamineWorkbook: from python_calamine import load_workbook - return load_workbook( - filepath_or_buffer, **engine_kwargs # type: ignore[arg-type] - ) + return load_workbook(filepath_or_buffer, **engine_kwargs) @property def sheet_names(self) -> list[str]: From 4ac5ee21a073a93fbf47b1e95ea1637a02d870da Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 3 Mar 2024 13:54:00 +0100 Subject: [PATCH 124/396] Backport PR #57668 on branch 2.2.x (CLN: More numpy 2 stuff) (#57707) Backport PR #57668: CLN: More numpy 2 stuff Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/_libs/src/vendored/ujson/python/objToJSON.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 74ca8ead3d936..fa91db5fe34e3 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -74,7 +74,6 @@ typedef struct __NpyArrContext { npy_intp ndim; npy_intp index[NPY_MAXDIMS]; int type_num; - PyArray_GetItemFunc *getitem; char **rowLabels; char **columnLabels; @@ -405,7 +404,6 @@ static void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { } npyarr->array = (PyObject *)obj; - npyarr->getitem = (PyArray_GetItemFunc *)PyArray_DESCR(obj)->f->getitem; npyarr->dataptr = PyArray_DATA(obj); npyarr->ndim = PyArray_NDIM(obj) - 1; npyarr->curdim = 0; @@ -492,7 +490,7 @@ static int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; } else { - GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); + GET_TC(tc)->itemValue = PyArray_GETITEM(arrayobj, npyarr->dataptr); } npyarr->dataptr += npyarr->stride; From 6db283c367ab178e5464bb5ff323bdc6c9ebce1f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 4 Mar 2024 12:45:38 -1000 Subject: [PATCH 125/396] Backport PR #57726: TST/CI: Fix test_repr on musl for dateutil 2.9 (#57728) --- pandas/tests/io/pytables/test_timezones.py | 2 +- pandas/tests/scalar/timestamp/test_formats.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 8c61830ebe038..c5613daf62207 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -104,7 +104,7 @@ def test_append_with_timezones(setup_path, gettz): msg = ( r"invalid info for \[values_block_1\] for \[tz\], " - r"existing_value 
\[(dateutil/.*)?US/Eastern\] "
+        r"existing_value \[(dateutil/.*)?(US/Eastern|America/New_York)\] "
         r"conflicts with new value \[(dateutil/.*)?EET\]"
     )
     with pytest.raises(ValueError, match=msg):

diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py
index d7160597ea6d6..e7ebcccef1c86 100644
--- a/pandas/tests/scalar/timestamp/test_formats.py
+++ b/pandas/tests/scalar/timestamp/test_formats.py
@@ -88,7 +88,7 @@ def test_isoformat(ts, timespec, expected_iso):
 class TestTimestampRendering:
-    timezones = ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/US/Pacific"]
+    timezones = ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/America/Los_Angeles"]

     @pytest.mark.parametrize("tz", timezones)
     @pytest.mark.parametrize("freq", ["D", "M", "S", "N"])

From 3cc5afa53e123a89e594df87630f9ac61e718a09 Mon Sep 17 00:00:00 2001
From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com>
Date: Mon, 4 Mar 2024 23:46:47 +0100
Subject: [PATCH 126/396] Backport PR #57721 on branch 2.2.x (update from 2022 to 2024 image) (#57729)

Backport PR #57721: update from 2022 to 2024 image

Co-authored-by: Thomas Baumann
---
 .circleci/config.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 90afb1ce29684..ea93575ac9430 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -3,7 +3,7 @@ version: 2.1
 jobs:
   test-arm:
     machine:
-      image: ubuntu-2004:2022.04.1
+      image: default
     resource_class: arm.large
     environment:
       ENV_FILE: ci/deps/circle-310-arm64.yaml
@@ -46,7 +46,7 @@ jobs:
       cibw-build:
         type: string
     machine:
-      image: ubuntu-2004:2022.04.1
+      image: default
     resource_class: arm.large
    environment:
       TRIGGER_SOURCE: << pipeline.trigger_source >>

From 301f9145b26f70c18e2a34c0e2fd1a346ba413d9 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 5 Mar 2024 14:08:17 -1000
Subject: [PATCH 127/396] Backport PR #57172: MAINT: Adjust the codebase to the new np.array's copy keyword meaning (#57740)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Mateusz Sokół <8431159+mtsokol@users.noreply.github.com>
---
 pandas/core/array_algos/quantile.py | 6 ++---
 pandas/core/arrays/arrow/array.py | 4 +++-
 pandas/core/arrays/base.py | 5 +++-
 pandas/core/arrays/categorical.py | 4 +++-
 pandas/core/arrays/datetimelike.py | 4 +++-
 pandas/core/arrays/datetimes.py | 6 ++---
 pandas/core/arrays/interval.py | 4 +++-
 pandas/core/arrays/masked.py | 4 +++-
 pandas/core/arrays/numeric.py | 14 +++++++----
 pandas/core/arrays/numpy_.py | 4 +++-
 pandas/core/arrays/period.py | 9 ++++++--
 pandas/core/arrays/sparse/array.py | 4 +++-
 pandas/core/arrays/timedeltas.py | 7 ++++--
 pandas/core/construction.py | 14 ++++++++---
 pandas/core/dtypes/cast.py | 7 ++++--
 pandas/core/dtypes/missing.py | 2 +-
 pandas/core/frame.py | 2 +-
 pandas/core/generic.py | 4 +++-
 pandas/core/indexes/base.py | 2 +-
 pandas/core/indexes/multi.py | 6 ++---
 pandas/core/internals/managers.py | 2 ++
 pandas/core/series.py | 7 +++++-
 pandas/io/pytables.py | 2 +-
 .../tests/arrays/integer/test_arithmetic.py | 1 +
 pandas/tests/arrays/test_datetimelike.py | 23 +++++++++++--------
 pandas/tests/dtypes/test_inference.py | 4 ++--
 .../tests/extension/array_with_attr/array.py | 5 +++-
 pandas/tests/extension/json/array.py | 8 ++++---
 pandas/tests/extension/list/array.py | 5 +++-
 pandas/tests/extension/test_common.py | 8 ++++---
 .../tests/frame/methods/test_select_dtypes.py | 2 +-
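The mechanical change running through this patch is easiest to see in isolation. Below is a minimal sketch of the __array__ signature pattern being adopted, assuming only NumPy; MyArray is a hypothetical stand-in for the pandas array classes touched by the diff:

    import numpy as np

    class MyArray:
        # Toy wrapper exposing the protocol that np.asarray() calls into.
        def __init__(self, values):
            self._values = np.asarray(values)

        def __array__(self, dtype=None, copy=None):
            # NumPy >= 2 may pass copy explicitly; NumPy 1.x never does.
            # Accepting (and, as at most call sites here, ignoring) it keeps
            # the object consumable by np.asarray under both major versions.
            return np.asarray(self._values, dtype=dtype)

    print(np.asarray(MyArray([1, 2, 3])))  # -> array([1, 2, 3])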
pandas/tests/frame/test_arithmetic.py | 2 +- pandas/tests/indexes/test_index_new.py | 2 +- 33 files changed, 125 insertions(+), 58 deletions(-) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index ee6f00b219a15..5c933294fb944 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -102,7 +102,7 @@ def quantile_with_mask( interpolation=interpolation, ) - result = np.array(result, copy=False) + result = np.asarray(result) result = result.T return result @@ -201,9 +201,9 @@ def _nanpercentile( ] if values.dtype.kind == "f": # preserve itemsize - result = np.array(result, dtype=values.dtype, copy=False).T + result = np.asarray(result, dtype=values.dtype).T else: - result = np.array(result, copy=False).T + result = np.asarray(result).T if ( result.dtype != values.dtype and not mask.all() diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index f8b07bd73d315..f2b8aa75ca5bf 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -656,7 +656,9 @@ def __arrow_array__(self, type=None): """Convert myself to a pyarrow ChunkedArray.""" return self._pa_array - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" return self.to_numpy(dtype=dtype) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index ea0e2e54e3339..abfe2369b0d0d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -719,7 +719,10 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: return TimedeltaArray._from_sequence(self, dtype=dtype, copy=copy) - return np.array(self, dtype=dtype, copy=copy) + if not copy: + return np.asarray(self, dtype=dtype) + else: + return np.array(self, dtype=dtype, copy=copy) def isna(self) -> np.ndarray | ExtensionArraySupportsAnyAll: """ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b87c5375856dc..f191f7277743f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1636,7 +1636,9 @@ def _validate_codes_for_dtype(cls, codes, *, dtype: CategoricalDtype) -> np.ndar # ------------------------------------------------------------- @ravel_compat - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """ The numpy array interface. 
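The companion rewrites from np.array(..., copy=False) to np.asarray(...) in this patch follow from the changed meaning of copy=False in NumPy 2. A minimal sketch of the behavioral difference, assuming NumPy >= 2.0 is installed:

    import numpy as np

    values = [1, 2, 3]        # converting a list always requires a copy
    arr = np.asarray(values)  # "copy only if needed"; works on 1.x and 2.x

    try:
        np.array(values, copy=False)
    except ValueError:
        # Under NumPy >= 2.0, copy=False means "never copy" and raises when
        # a copy is unavoidable; under 1.x it silently made the copy anyway.
        print("copy=False now forbids copying")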
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a0e0a1434e871..1042a1b3fde61 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -351,7 +351,9 @@ def _formatter(self, boxed: bool = False): # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): return np.array(list(self), dtype=object) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 02656b655a0c6..a146220d249e2 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -635,12 +635,12 @@ def _resolution_obj(self) -> Resolution: # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: if dtype is None and self.tz: # The default for tz-aware is object, to preserve tz info dtype = object - return super().__array__(dtype=dtype) + return super().__array__(dtype=dtype, copy=copy) def __iter__(self) -> Iterator: """ @@ -2393,7 +2393,7 @@ def objects_to_datetime64( assert errors in ["raise", "ignore", "coerce"] # if str-dtype, convert - data = np.array(data, copy=False, dtype=np.object_) + data = np.asarray(data, dtype=np.object_) result, tz_parsed = tslib.array_to_datetime( data, diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index e69f996441703..91db7f11bcbe0 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1567,7 +1567,9 @@ def is_non_overlapping_monotonic(self) -> bool: # --------------------------------------------------------------------- # Conversion - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """ Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 320d8cb10b8c2..d7e816b9d3781 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -593,7 +593,9 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: __array_priority__ = 1000 # higher than ndarray so ops dispatch to us - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """ the array interface, return my values We return an object array here to preserve our scalar values diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 210450e868698..68fa7fcb6573c 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -159,7 +159,10 @@ def _coerce_to_data_and_mask( return values, mask, dtype, inferred_type original = values - values = np.array(values, copy=copy) + if not copy: + values = np.asarray(values) + else: + values = np.array(values, copy=copy) inferred_type = None if values.dtype == object or is_string_dtype(values.dtype): inferred_type = lib.infer_dtype(values, skipna=True) @@ -168,7 +171,10 @@ def _coerce_to_data_and_mask( raise TypeError(f"{values.dtype} cannot be converted to {name}") elif 
values.dtype.kind == "b" and checker(dtype): - values = np.array(values, dtype=default_dtype, copy=copy) + if not copy: + values = np.asarray(values, dtype=default_dtype) + else: + values = np.array(values, dtype=default_dtype, copy=copy) elif values.dtype.kind not in "iuf": name = dtype_cls.__name__.strip("_") @@ -207,9 +213,9 @@ def _coerce_to_data_and_mask( inferred_type not in ["floating", "mixed-integer-float"] and not mask.any() ): - values = np.array(original, dtype=dtype, copy=False) + values = np.asarray(original, dtype=dtype) else: - values = np.array(original, dtype="object", copy=False) + values = np.asarray(original, dtype="object") # we copy as need to coerce here if mask.any(): diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index d83a37088daec..07eb91e0cb13b 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -150,7 +150,9 @@ def dtype(self) -> NumpyEADtype: # ------------------------------------------------------------------------ # NumPy Array Interface - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: return np.asarray(self._ndarray, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index d635eb4a25df3..c1229e27ab51a 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -256,7 +256,10 @@ def __init__( raise raise_on_incompatible(values, dtype.freq) values, dtype = values._ndarray, values.dtype - values = np.array(values, dtype="int64", copy=copy) + if not copy: + values = np.asarray(values, dtype="int64") + else: + values = np.array(values, dtype="int64", copy=copy) if dtype is None: raise ValueError("dtype is not specified and cannot be inferred") dtype = cast(PeriodDtype, dtype) @@ -400,7 +403,9 @@ def freq(self) -> BaseOffset: def freqstr(self) -> str: return freq_to_period_freqstr(self.freq.n, self.freq.name) - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: if dtype == "i8": return self.asi8 elif dtype == bool: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 98d84d899094b..82fcfa74ec7d2 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -551,7 +551,9 @@ def from_spmatrix(cls, data: spmatrix) -> Self: return cls._simple_new(arr, index, dtype) - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: fill_value = self.fill_value if self.sp_index.ngaps == 0: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 1b885a2bdcd47..e9260a3ec50a2 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -1072,7 +1072,10 @@ def sequence_to_td64ns( # This includes datetime64-dtype, see GH#23539, GH#29794 raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]") - data = np.array(data, copy=copy) + if not copy: + data = np.asarray(data) + else: + data = np.array(data, copy=copy) assert data.dtype.kind == "m" assert data.dtype != "m8" # i.e. not unit-less @@ -1150,7 +1153,7 @@ def _objects_to_td64ns(data, unit=None, errors: DateTimeErrorChoices = "raise"): higher level. 
""" # coerce Index to np.ndarray, converting string-dtype if necessary - values = np.array(data, dtype=np.object_, copy=False) + values = np.asarray(data, dtype=np.object_) result = array_to_timedelta64(values, unit=unit, errors=errors) return result.view("timedelta64[ns]") diff --git a/pandas/core/construction.py b/pandas/core/construction.py index d41a9c80a10ec..f8250ae475a10 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -626,7 +626,10 @@ def sanitize_array( elif hasattr(data, "__array__"): # e.g. dask array GH#38645 - data = np.array(data, copy=copy) + if not copy: + data = np.asarray(data) + else: + data = np.array(data, copy=copy) return sanitize_array( data, index=index, @@ -744,8 +747,11 @@ def _sanitize_str_dtypes( # GH#19853: If data is a scalar, result has already the result if not lib.is_scalar(data): if not np.all(isna(data)): - data = np.array(data, dtype=dtype, copy=False) - result = np.array(data, dtype=object, copy=copy) + data = np.asarray(data, dtype=dtype) + if not copy: + result = np.asarray(data, dtype=object) + else: + result = np.array(data, dtype=object, copy=copy) return result @@ -810,6 +816,8 @@ def _try_cast( # this will raise if we have e.g. floats subarr = maybe_cast_to_integer_array(arr, dtype) + elif not copy: + subarr = np.asarray(arr, dtype=dtype) else: subarr = np.array(arr, dtype=dtype, copy=copy) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 690af6b0ebdc7..7dd81ec59bc49 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1501,7 +1501,10 @@ def construct_2d_arraylike_from_scalar( # Attempt to coerce to a numpy array try: - arr = np.array(value, dtype=dtype, copy=copy) + if not copy: + arr = np.asarray(value, dtype=dtype) + else: + arr = np.array(value, dtype=dtype, copy=copy) except (ValueError, TypeError) as err: raise TypeError( f"DataFrame constructor called with incompatible data and dtype: {err}" @@ -1651,7 +1654,7 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n "out-of-bound Python int", DeprecationWarning, ) - casted = np.array(arr, dtype=dtype, copy=False) + casted = np.asarray(arr, dtype=dtype) else: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=RuntimeWarning) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 655a53997620a..c341ff9dff7e6 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -632,7 +632,7 @@ def infer_fill_value(val): """ if not is_list_like(val): val = [val] - val = np.array(val, copy=False) + val = np.asarray(val) if val.dtype.kind in "mM": return np.array("NaT", dtype=val.dtype) elif val.dtype == object: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c09989fe87fb0..5c510d98596df 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1980,7 +1980,7 @@ def to_numpy( dtype = np.dtype(dtype) result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value) if result.dtype is not dtype: - result = np.array(result, dtype=dtype, copy=False) + result = np.asarray(result, dtype=dtype) return result diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1dd471a09f1b1..2a86f75badecd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2145,7 +2145,9 @@ def empty(self) -> bool_t: # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented __array_priority__: int = 1000 - def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: + def __array__( + self, 
dtype: npt.DTypeLike | None = None, copy: bool_t | None = None + ) -> np.ndarray: values = self._values arr = np.asarray(values, dtype=dtype) if ( diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4b3d1a9e006dc..6822c2c99427e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -912,7 +912,7 @@ def __len__(self) -> int: """ return len(self._data) - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: """ The array interface, return my values. """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 02a841a2075fd..091ddbcc099be 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -774,7 +774,7 @@ def _values(self) -> np.ndarray: ): vals = vals.astype(object) - vals = np.array(vals, copy=False) + vals = np.asarray(vals) vals = algos.take_nd(vals, codes, fill_value=index._na_value) values.append(vals) @@ -1309,7 +1309,7 @@ def copy( # type: ignore[override] new_index._id = self._id return new_index - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: """the array interface, return my values""" return self.values @@ -3397,7 +3397,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): locs = (level_codes >= idx.start) & (level_codes < idx.stop) return locs - locs = np.array(level_codes == idx, dtype=bool, copy=False) + locs = np.asarray(level_codes == idx, dtype=bool) if not locs.any(): # The label is present in self.levels[level] but unused: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 220bb1133dfd5..2e0e04717373f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1682,6 +1682,8 @@ def as_array( na_value=na_value, copy=copy, ).reshape(blk.shape) + elif not copy: + arr = np.asarray(blk.values, dtype=dtype) else: arr = np.array(blk.values, dtype=dtype, copy=copy) diff --git a/pandas/core/series.py b/pandas/core/series.py index c6a905cbb6ec1..236085c2a62e1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -971,7 +971,9 @@ def view(self, dtype: Dtype | None = None) -> Series: # ---------------------------------------------------------------------- # NDArray Compat - def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: + def __array__( + self, dtype: npt.DTypeLike | None = None, copy: bool | None = None + ) -> np.ndarray: """ Return the values as a NumPy array. @@ -984,6 +986,9 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: The dtype to use for the resulting NumPy array. By default, the dtype is inferred from the data. + copy : bool or None, optional + Unused. 
+ Returns ------- numpy.ndarray diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 1139519d2bcd3..13c2f10785124 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4065,7 +4065,7 @@ def _create_axes( if isinstance(data_converted.dtype, CategoricalDtype): ordered = data_converted.ordered meta = "category" - metadata = np.array(data_converted.categories, copy=False).ravel() + metadata = np.asarray(data_converted.categories).ravel() data, dtype_name = _get_data_and_dtype_name(data_converted) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index d979dd445a61a..8acd298f37a07 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -197,6 +197,7 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "Addition/subtraction of integers and integer-arrays with Timestamp", "has no kernel", "not implemented", + "The 'out' kwarg is necessary. Use numpy.strings.multiply without it.", ] ) with pytest.raises(errs, match=msg): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 82524ea115019..7f85c891afeed 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -12,6 +12,7 @@ Timestamp, ) from pandas._libs.tslibs.dtypes import freq_to_period_freqstr +from pandas.compat.numpy import np_version_gt2 import pandas as pd from pandas import ( @@ -638,13 +639,14 @@ def test_round(self, arr1d): def test_array_interface(self, datetime_index): arr = datetime_index._data + copy_false = None if np_version_gt2 else False # default asarray gives the same underlying data (for tz naive) result = np.asarray(arr) expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, copy=False) + result = np.array(arr, copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) @@ -653,7 +655,7 @@ def test_array_interface(self, datetime_index): expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype="datetime64[ns]", copy=False) + result = np.array(arr, dtype="datetime64[ns]", copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype="datetime64[ns]") @@ -696,6 +698,7 @@ def test_array_tz(self, arr1d): # GH#23524 arr = arr1d dti = self.index_cls(arr1d) + copy_false = None if np_version_gt2 else False expected = dti.asi8.view("M8[ns]") result = np.array(arr, dtype="M8[ns]") @@ -704,17 +707,18 @@ def test_array_tz(self, arr1d): result = np.array(arr, dtype="datetime64[ns]") tm.assert_numpy_array_equal(result, expected) - # check that we are not making copies when setting copy=False - result = np.array(arr, dtype="M8[ns]", copy=False) + # check that we are not making copies when setting copy=copy_false + result = np.array(arr, dtype="M8[ns]", copy=copy_false) assert result.base is expected.base assert result.base is not None - result = np.array(arr, dtype="datetime64[ns]", copy=False) + result = np.array(arr, dtype="datetime64[ns]", copy=copy_false) assert result.base is expected.base assert result.base is not None def test_array_i8_dtype(self, arr1d): arr = arr1d dti = self.index_cls(arr1d) + copy_false = None if np_version_gt2 else False expected = dti.asi8 result = np.array(arr, dtype="i8") @@ -723,8 +727,8 @@ def test_array_i8_dtype(self, 
arr1d): result = np.array(arr, dtype=np.int64) tm.assert_numpy_array_equal(result, expected) - # check that we are still making copies when setting copy=False - result = np.array(arr, dtype="i8", copy=False) + # check that we are still making copies when setting copy=copy_false + result = np.array(arr, dtype="i8", copy=copy_false) assert result.base is not expected.base assert result.base is None @@ -950,13 +954,14 @@ def test_int_properties(self, timedelta_index, propname): def test_array_interface(self, timedelta_index): arr = timedelta_index._data + copy_false = None if np_version_gt2 else False # default asarray gives the same underlying data result = np.asarray(arr) expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, copy=False) + result = np.array(arr, copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) @@ -965,7 +970,7 @@ def test_array_interface(self, timedelta_index): expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype="timedelta64[ns]", copy=False) + result = np.array(arr, dtype="timedelta64[ns]", copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype="timedelta64[ns]") diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 49eb06c299886..0567be737c681 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -112,8 +112,8 @@ def it_outer(): def __len__(self) -> int: return len(self._values) - def __array__(self, t=None): - return np.asarray(self._values, dtype=t) + def __array__(self, dtype=None, copy=None): + return np.asarray(self._values, dtype=dtype) @property def ndim(self): diff --git a/pandas/tests/extension/array_with_attr/array.py b/pandas/tests/extension/array_with_attr/array.py index d0249d9af8098..2789d51ec2ce3 100644 --- a/pandas/tests/extension/array_with_attr/array.py +++ b/pandas/tests/extension/array_with_attr/array.py @@ -49,7 +49,10 @@ def __init__(self, values, attr=None) -> None: @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): - data = np.array(scalars, dtype="float64", copy=copy) + if not copy: + data = np.asarray(scalars, dtype="float64") + else: + data = np.array(scalars, dtype="float64", copy=copy) return cls(data) def __getitem__(self, item): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 31f44f886add7..e43b50322bb92 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -146,7 +146,7 @@ def __eq__(self, other): def __ne__(self, other): return NotImplemented - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): if dtype is None: dtype = object if dtype == object: @@ -210,8 +210,10 @@ def astype(self, dtype, copy=True): value = self.astype(str) # numpy doesn't like nested dicts arr_cls = dtype.construct_array_type() return arr_cls._from_sequence(value, dtype=dtype, copy=False) - - return np.array([dict(x) for x in self], dtype=dtype, copy=copy) + elif not copy: + return np.asarray([dict(x) for x in self], dtype=dtype) + else: + return np.array([dict(x) for x in self], dtype=dtype, copy=copy) def unique(self): # Parent method doesn't work since np.array will try to infer diff --git a/pandas/tests/extension/list/array.py b/pandas/tests/extension/list/array.py index f07585c0aec10..b3bb35c9396f4 
100644 --- a/pandas/tests/extension/list/array.py +++ b/pandas/tests/extension/list/array.py @@ -115,7 +115,10 @@ def astype(self, dtype, copy=True): elif is_string_dtype(dtype) and not is_object_dtype(dtype): # numpy has problems with astype(str) for nested elements return np.array([str(x) for x in self.data], dtype=dtype) - return np.array(self.data, dtype=dtype, copy=copy) + elif not copy: + return np.asarray(self.data, dtype=dtype) + else: + return np.array(self.data, dtype=dtype, copy=copy) @classmethod def _concat_same_type(cls, to_concat): diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 3d8523f344d46..5eda0f00f54ca 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -17,7 +17,7 @@ class DummyArray(ExtensionArray): def __init__(self, data) -> None: self.data = data - def __array__(self, dtype): + def __array__(self, dtype=None, copy=None): return self.data @property @@ -30,8 +30,10 @@ def astype(self, dtype, copy=True): if copy: return type(self)(self.data) return self - - return np.array(self, dtype=dtype, copy=copy) + elif not copy: + return np.asarray(self, dtype=dtype) + else: + return np.array(self, dtype=dtype, copy=copy) class TestExtensionArrayDtype: diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 47c479faed1ef..d1bee6a3de613 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -32,7 +32,7 @@ def __init__(self, data, dtype) -> None: self.data = data self._dtype = dtype - def __array__(self, dtype): + def __array__(self, dtype=None, copy=None): return self.data @property diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 42ce658701355..0593de7556406 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -59,7 +59,7 @@ def __init__(self, value, dtype) -> None: self.value = value self.dtype = np.dtype(dtype) - def __array__(self): + def __array__(self, dtype=None, copy=None): return np.array(self.value, dtype=self.dtype) def __str__(self) -> str: diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index 72641077c90fe..6042e5b9cc679 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -413,7 +413,7 @@ class ArrayLike: def __init__(self, array) -> None: self.array = array - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: return self.array expected = Index(array) From 63b9eba683019ef1b3ce3b66fe71dc7712f2d4aa Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 7 Mar 2024 18:13:44 +0100 Subject: [PATCH 128/396] Backport PR #57759 on branch 2.2.x (DOC: add whatsnew for v2.2.2) (#57763) * Backport PR #57759: DOC: add whatsnew for v2.2.2 * [skip-ci] --------- Co-authored-by: Marco Edward Gorelli Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v2.2.2.rst | 36 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 doc/source/whatsnew/v2.2.2.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 3a2ab4c17d1bd..34a2845290d5a 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 
+16,7 @@ Version 2.2
 .. toctree::
    :maxdepth: 2

+   v2.2.2
    v2.2.1
    v2.2.0

diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst
new file mode 100644
index 0000000000000..058f7aebcd538
--- /dev/null
+++ b/doc/source/whatsnew/v2.2.2.rst
@@ -0,0 +1,36 @@
+.. _whatsnew_222:
+
+What's new in 2.2.2 (April XX, 2024)
+---------------------------------------
+
+These are the changes in pandas 2.2.2. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_222.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+-
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_222.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+-
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_222.other:
+
+Other
+~~~~~
+-
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_222.contributors:
+
+Contributors
+~~~~~~~~~~~~

From e44f91d70ef02081bebec8137b59db1c8cc2161d Mon Sep 17 00:00:00 2001
From: Marco Edward Gorelli
Date: Fri, 8 Mar 2024 07:02:16 +0000
Subject: [PATCH 129/396] Backport PR #57665 on branch 2.2.x (BUG: interchange protocol with nullable datatypes a non-null validity) (#57769)

BUG: interchange protocol with nullable datatypes a non-null validity (#57665)

* BUG: interchange protocol with nullable datatypes a non-null validity provides nonsense results
* whatsnew
* :label: typing
* parametrise over more types
* move whatsnew

(cherry picked from commit 03717bcc5ae762d8a0ab8d259ca000af66e8ba82)
---
 doc/source/whatsnew/v2.2.2.rst | 1 +
 pandas/core/interchange/column.py | 18 ++++++++++-
 pandas/tests/interchange/test_impl.py | 44 +++++++++++++++++++++++----
 3 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst
index 058f7aebcd538..96f210ce6b7b9 100644
--- a/doc/source/whatsnew/v2.2.2.rst
+++ b/doc/source/whatsnew/v2.2.2.rst
@@ -13,6 +13,7 @@ including other versions of pandas.

 Fixed regressions
 ~~~~~~~~~~~~~~~~~
+- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pandas nullable dtype with missing values (:issue:`56702`)
-

..
--------------------------------------------------------------------------- diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index e273ecad8b51e..7b39403ca1916 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -190,6 +190,10 @@ def describe_categorical(self): @property def describe_null(self): + if isinstance(self._col.dtype, BaseMaskedDtype): + column_null_dtype = ColumnNullType.USE_BYTEMASK + null_value = 1 + return column_null_dtype, null_value kind = self.dtype[0] try: null, value = _NULL_DESCRIPTION[kind] @@ -290,7 +294,13 @@ def _get_data_buffer( if self.dtype[0] == DtypeKind.DATETIME and len(self.dtype[2]) > 4: np_arr = self._col.dt.tz_convert(None).to_numpy() else: - np_arr = self._col.to_numpy() + arr = self._col.array + if isinstance(self._col.dtype, BaseMaskedDtype): + np_arr = arr._data # type: ignore[attr-defined] + elif isinstance(self._col.dtype, ArrowDtype): + raise NotImplementedError("ArrowDtype not handled yet") + else: + np_arr = arr._ndarray # type: ignore[attr-defined] buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == DtypeKind.CATEGORICAL: @@ -328,6 +338,12 @@ def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]: """ null, invalid = self.describe_null + if isinstance(self._col.dtype, BaseMaskedDtype): + mask = self._col.array._mask # type: ignore[attr-defined] + buffer = PandasBuffer(mask) + dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE) + return buffer, dtype + if self.dtype[0] == DtypeKind.STRING: # For now, use byte array as the mask. # TODO: maybe store as bit array to save space?.. diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index d47b533f92235..a1dedb6be456c 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -9,7 +9,6 @@ is_platform_windows, ) from pandas.compat.numpy import np_version_lt1p23 -import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -404,17 +403,50 @@ def test_non_str_names_w_duplicates(): pd.api.interchange.from_dataframe(dfi, allow_copy=False) -@pytest.mark.parametrize( - "dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))] -) -def test_nullable_integers(dtype: str) -> None: +def test_nullable_integers() -> None: + # https://github.com/pandas-dev/pandas/issues/55069 + df = pd.DataFrame({"a": [1]}, dtype="Int8") + expected = pd.DataFrame({"a": [1]}, dtype="int8") + result = pd.api.interchange.from_dataframe(df.__dataframe__()) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/57664") +def test_nullable_integers_pyarrow() -> None: # https://github.com/pandas-dev/pandas/issues/55069 - df = pd.DataFrame({"a": [1]}, dtype=dtype) + df = pd.DataFrame({"a": [1]}, dtype="Int8[pyarrow]") expected = pd.DataFrame({"a": [1]}, dtype="int8") result = pd.api.interchange.from_dataframe(df.__dataframe__()) tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + ("data", "dtype", "expected_dtype"), + [ + ([1, 2, None], "Int64", "int64"), + ( + [1, 2, None], + "UInt64", + "uint64", + ), + ([1.0, 2.25, None], "Float32", "float32"), + ], +) +def test_pandas_nullable_w_missing_values( + data: list, dtype: str, expected_dtype: str +) -> None: + # https://github.com/pandas-dev/pandas/issues/57643 + 
pytest.importorskip("pyarrow", "11.0.0") + import pyarrow.interchange as pai + + df = pd.DataFrame({"a": data}, dtype=dtype) + result = pai.from_dataframe(df.__dataframe__())["a"] + assert result.type == expected_dtype + assert result[0].as_py() == data[0] + assert result[1].as_py() == data[1] + assert result[2].as_py() is None + + def test_empty_dataframe(): # https://github.com/pandas-dev/pandas/issues/56700 df = pd.DataFrame({"a": []}, dtype="int8") From d600189edeb0768477d929b667f57d0f6566eb23 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 8 Mar 2024 22:35:59 +0100 Subject: [PATCH 130/396] Backport PR #57780 on branch 2.2.x (COMPAT: Adapt to Numpy 2.0 dtype changes) (#57784) Backport PR #57780: COMPAT: Adapt to Numpy 2.0 dtype changes Co-authored-by: Sebastian Berg --- pandas/_libs/src/datetime/pd_datetime.c | 4 ++++ .../_libs/src/vendored/numpy/datetime/np_datetime.c | 12 ++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/datetime/pd_datetime.c b/pandas/_libs/src/datetime/pd_datetime.c index 19de51be6e1b2..4c1969f6d9f57 100644 --- a/pandas/_libs/src/datetime/pd_datetime.c +++ b/pandas/_libs/src/datetime/pd_datetime.c @@ -20,6 +20,9 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt #include #include "datetime.h" +/* Need to import_array for np_datetime.c (for NumPy 1.x support only) */ +#define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY +#include "numpy/ndarrayobject.h" #include "pandas/datetime/pd_datetime.h" #include "pandas/portable.h" @@ -255,5 +258,6 @@ static struct PyModuleDef pandas_datetimemodule = { PyMODINIT_FUNC PyInit_pandas_datetime(void) { PyDateTime_IMPORT; + import_array(); return PyModuleDef_Init(&pandas_datetimemodule); } diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index 277d01807f2f3..934c54fafb634 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -16,8 +16,6 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt // Licence at LICENSES/NUMPY_LICENSE -#define NO_IMPORT - #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #endif // NPY_NO_DEPRECATED_API @@ -25,7 +23,10 @@ This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt #include #include "pandas/vendored/numpy/datetime/np_datetime.h" -#include + +#define NO_IMPORT_ARRAY +#define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY +#include #include #if defined(_WIN32) @@ -1070,5 +1071,8 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, */ PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(PyArray_Descr *dtype) { - return (((PyArray_DatetimeDTypeMetaData *)dtype->c_metadata)->meta); +#if NPY_ABI_VERSION < 0x02000000 +#define PyDataType_C_METADATA(dtype) ((dtype)->c_metadata) +#endif + return ((PyArray_DatetimeDTypeMetaData *)PyDataType_C_METADATA(dtype))->meta; } From 33006cd14d5febd30176b0003be668770c0e1671 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 12 Mar 2024 22:15:24 +0100 Subject: [PATCH 131/396] Backport PR #57821 on branch 2.2.x (Fix doc build) (#57822) Backport PR #57821: Fix doc build Co-authored-by: Trinh Quoc Anh --- environment.yml | 1 + requirements-dev.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index 58eb69ad1f070..9cec788ca600b 100644 --- a/environment.yml +++ b/environment.yml @@ -62,6 +62,7 @@ dependencies: # downstream packages - dask-core - seaborn-base + - dask-expr # local testing dependencies - moto diff --git a/requirements-dev.txt b/requirements-dev.txt index 5a63e59e1db88..2e7d06ea1c12d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -49,6 +49,7 @@ xlsxwriter>=3.0.5 zstandard>=0.19.0 dask seaborn +dask-expr moto flask asv>=0.6.1 From 9ed53829a2e2f2115c7805c3b255f3a5ef53e390 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 13 Mar 2024 20:30:32 +0100 Subject: [PATCH 132/396] Backport PR #57830 on branch 2.2.x (DOC: Pin dask/dask-expr for scale.rst) (#57832) Backport PR #57830: DOC: Pin dask/dask-expr for scale.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- environment.yml | 4 ++-- requirements-dev.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/environment.yml b/environment.yml index 9cec788ca600b..7db2767661d62 100644 --- a/environment.yml +++ b/environment.yml @@ -60,9 +60,9 @@ dependencies: - zstandard>=0.19.0 # downstream packages - - dask-core + - dask-core<=2024.2.1 - seaborn-base - - dask-expr + - dask-expr<=0.5.3 # local testing dependencies - moto diff --git a/requirements-dev.txt b/requirements-dev.txt index 2e7d06ea1c12d..68e386edb0c31 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -47,9 +47,9 @@ xarray>=2022.12.0 xlrd>=2.0.1 xlsxwriter>=3.0.5 zstandard>=0.19.0 -dask +dask<=2024.2.1 seaborn -dask-expr +dask-expr<=0.5.3 moto flask asv>=0.6.1 From 4fdbe560664b12d26e68921a7d1f015d1f5030cd Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 14 Mar 2024 01:39:40 +0100 Subject: [PATCH 133/396] Backport PR #57796 on branch 2.2.x (Fix issue with Tempita recompilation) (#57834) Backport PR #57796: Fix issue with Tempita recompilation Co-authored-by: William Ayd --- pandas/_libs/meson.build | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index c27386743c6e9..7621915ebcfdb 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -54,25 +54,37 @@ _intervaltree_helper = custom_target('intervaltree_helper_pxi', py, tempita, '@INPUT@', 
'-o', '@OUTDIR@' ] ) -_khash_primitive_helper_dep = declare_dependency(sources: _khash_primitive_helper) + +_algos_pxi_dep = declare_dependency(sources: [_algos_take_helper, _algos_common_helper]) +_khash_pxi_dep = declare_dependency(sources: _khash_primitive_helper) +_hashtable_pxi_dep = declare_dependency( + sources: [_hashtable_class_helper, _hashtable_func_helper] +) +_index_pxi_dep = declare_dependency(sources: _index_class_helper) +_intervaltree_pxi_dep = declare_dependency(sources: _intervaltree_helper) +_sparse_pxi_dep = declare_dependency(sources: _sparse_op_helper) + subdir('tslibs') libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} # numpy include dir is implicitly included - 'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], 'deps': _khash_primitive_helper_dep}, + 'algos': {'sources': ['algos.pyx'], + 'deps': [_khash_pxi_dep, _algos_pxi_dep]}, 'arrays': {'sources': ['arrays.pyx']}, 'groupby': {'sources': ['groupby.pyx']}, 'hashing': {'sources': ['hashing.pyx']}, - 'hashtable': {'sources': ['hashtable.pyx', _hashtable_class_helper, _hashtable_func_helper], 'deps': _khash_primitive_helper_dep}, - 'index': {'sources': ['index.pyx', _index_class_helper], 'deps': _khash_primitive_helper_dep}, + 'hashtable': {'sources': ['hashtable.pyx'], + 'deps': [_khash_pxi_dep, _hashtable_pxi_dep]}, + 'index': {'sources': ['index.pyx'], + 'deps': [_khash_pxi_dep, _index_pxi_dep]}, 'indexing': {'sources': ['indexing.pyx']}, 'internals': {'sources': ['internals.pyx']}, - 'interval': {'sources': ['interval.pyx', _intervaltree_helper], - 'deps': _khash_primitive_helper_dep}, - 'join': {'sources': ['join.pyx', _khash_primitive_helper], - 'deps': _khash_primitive_helper_dep}, + 'interval': {'sources': ['interval.pyx'], + 'deps': [_khash_pxi_dep, _intervaltree_pxi_dep]}, + 'join': {'sources': ['join.pyx'], + 'deps': [_khash_pxi_dep]}, 'lib': {'sources': ['lib.pyx', 'src/parser/tokenizer.c']}, 'missing': {'sources': ['missing.pyx']}, 'pandas_datetime': {'sources': ['src/vendored/numpy/datetime/np_datetime.c', @@ -83,7 +95,7 @@ libs_sources = { 'src/parser/io.c', 'src/parser/pd_parser.c']}, 'parsers': {'sources': ['parsers.pyx', 'src/parser/tokenizer.c', 'src/parser/io.c'], - 'deps': _khash_primitive_helper_dep}, + 'deps': [_khash_pxi_dep]}, 'json': {'sources': ['src/vendored/ujson/python/ujson.c', 'src/vendored/ujson/python/objToJSON.c', 'src/vendored/ujson/python/JSONtoObj.c', @@ -95,7 +107,8 @@ libs_sources = { 'reshape': {'sources': ['reshape.pyx']}, 'sas': {'sources': ['sas.pyx']}, 'byteswap': {'sources': ['byteswap.pyx']}, - 'sparse': {'sources': ['sparse.pyx', _sparse_op_helper]}, + 'sparse': {'sources': ['sparse.pyx'], + 'deps': [_sparse_pxi_dep]}, 'tslib': {'sources': ['tslib.pyx']}, 'testing': {'sources': ['testing.pyx']}, 'writers': {'sources': ['writers.pyx']} From b6488afd3bdcca877683ea2efe4183a0d8ea84d8 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 15 Mar 2024 19:16:46 +0100 Subject: [PATCH 134/396] Backport PR #57848 on branch 2.2.x (DOC: Remove duplicated Series.dt.normalize from docs) (#57854) Backport PR #57848: DOC: Remove duplicated Series.dt.normalize from docs Co-authored-by: Marc Garcia --- doc/source/reference/series.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index a4ea0ec396ceb..d40f6e559b8fa 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst 
@@ -342,7 +342,6 @@ Datetime properties Series.dt.tz Series.dt.freq Series.dt.unit - Series.dt.normalize Datetime methods ^^^^^^^^^^^^^^^^ From 962e23398ea07a610af238dd8e6188479a46cbb9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Mar 2024 12:43:38 -1000 Subject: [PATCH 135/396] Backport PR #57843: DOC: Remove Dask and Modin sections in scale.rst in favor of linking to ecosystem docs. (#57861) Co-authored-by: Yuki Kitayama <47092819+yukikitayama@users.noreply.github.com> --- doc/source/user_guide/scale.rst | 164 ++------------------------------ environment.yml | 3 +- requirements-dev.txt | 3 +- 3 files changed, 9 insertions(+), 161 deletions(-) diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index b262de5d71439..29df2994fbc35 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -156,7 +156,7 @@ fits in memory, you can work with datasets that are much larger than memory. Chunking works well when the operation you're performing requires zero or minimal coordination between chunks. For more complicated workflows, you're better off - :ref:`using another library <scale.other_libraries>`. + :ref:`using other libraries <scale.other_libraries>`. Suppose we have an even larger "logical dataset" on disk that's a directory of parquet files. Each file in the directory represents a different year of the entire dataset. @@ -219,160 +219,10 @@ different library that implements these out-of-core algorithms for you. .. _scale.other_libraries: -Use Dask -------- +Use Other Libraries +------------------- -pandas is just one library offering a DataFrame API. Because of its popularity, -pandas' API has become something of a standard that other libraries implement. -The pandas documentation maintains a list of libraries implementing a DataFrame API -in `the ecosystem page <https://pandas.pydata.org/community/ecosystem.html>`_. - -For example, `Dask`_, a parallel computing library, has `dask.dataframe`_, a -pandas-like API for working with larger than memory datasets in parallel. Dask -can use multiple threads or processes on a single machine, or a cluster of -machines to process data in parallel. - - -We'll import ``dask.dataframe`` and notice that the API feels similar to pandas. -We can use Dask's ``read_parquet`` function, but provide a globstring of files to read in. - -.. ipython:: python - :okwarning: - - import dask.dataframe as dd - - ddf = dd.read_parquet("data/timeseries/ts*.parquet", engine="pyarrow") - ddf - -Inspecting the ``ddf`` object, we see a few things - -* There are familiar attributes like ``.columns`` and ``.dtypes`` -* There are familiar methods like ``.groupby``, ``.sum``, etc. -* There are new attributes like ``.npartitions`` and ``.divisions`` - -The partitions and divisions are how Dask parallelizes computation. A **Dask** -DataFrame is made up of many pandas :class:`pandas.DataFrame`. A single method call on a -Dask DataFrame ends up making many pandas method calls, and Dask knows how to -coordinate everything to get the result. - -.. ipython:: python - - ddf.columns - ddf.dtypes - ddf.npartitions - -One major difference: the ``dask.dataframe`` API is *lazy*. If you look at the -repr above, you'll notice that the values aren't actually printed out; just the -column names and dtypes. That's because Dask hasn't actually read the data yet. -Rather than executing immediately, doing operations build up a **task graph**. ..
ipython:: python - :okwarning: - - ddf - ddf["name"] - ddf["name"].value_counts() - -Each of these calls is instant because the result isn't being computed yet. -We're just building up a list of computation to do when someone needs the -result. Dask knows that the return type of a :class:`pandas.Series.value_counts` -is a pandas :class:`pandas.Series` with a certain dtype and a certain name. So the Dask version -returns a Dask Series with the same dtype and the same name. - -To get the actual result you can call ``.compute()``. - -.. ipython:: python - :okwarning: - - %time ddf["name"].value_counts().compute() - -At that point, you get back the same thing you'd get with pandas, in this case -a concrete pandas :class:`pandas.Series` with the count of each ``name``. - -Calling ``.compute`` causes the full task graph to be executed. This includes -reading the data, selecting the columns, and doing the ``value_counts``. The -execution is done *in parallel* where possible, and Dask tries to keep the -overall memory footprint small. You can work with datasets that are much larger -than memory, as long as each partition (a regular pandas :class:`pandas.DataFrame`) fits in memory. - -By default, ``dask.dataframe`` operations use a threadpool to do operations in -parallel. We can also connect to a cluster to distribute the work on many -machines. In this case we'll connect to a local "cluster" made up of several -processes on this single machine. - -.. code-block:: python - - >>> from dask.distributed import Client, LocalCluster - - >>> cluster = LocalCluster() - >>> client = Client(cluster) - >>> client - - -Once this ``client`` is created, all of Dask's computation will take place on -the cluster (which is just processes in this case). - -Dask implements the most used parts of the pandas API. For example, we can do -a familiar groupby aggregation. - -.. ipython:: python - :okwarning: - - %time ddf.groupby("name")[["x", "y"]].mean().compute().head() - -The grouping and aggregation is done out-of-core and in parallel. - -When Dask knows the ``divisions`` of a dataset, certain optimizations are -possible. When reading parquet datasets written by dask, the divisions will be -known automatically. In this case, since we created the parquet files manually, -we need to supply the divisions manually. - -.. ipython:: python - :okwarning: - - N = 12 - starts = [f"20{i:>02d}-01-01" for i in range(N)] - ends = [f"20{i:>02d}-12-13" for i in range(N)] - - divisions = tuple(pd.to_datetime(starts)) + (pd.Timestamp(ends[-1]),) - ddf.divisions = divisions - ddf - -Now we can do things like fast random access with ``.loc``. - -.. ipython:: python - :okwarning: - - ddf.loc["2002-01-01 12:01":"2002-01-01 12:05"].compute() - -Dask knows to just look in the 3rd partition for selecting values in 2002. It -doesn't need to look at any other data. - -Many workflows involve a large amount of data and processing it in a way that -reduces the size to something that fits in memory. In this case, we'll resample -to daily frequency and take the mean. Once we've taken the mean, we know the -results will fit in memory, so we can safely call ``compute`` without running -out of memory. At that point it's just a regular pandas object. - -.. ipython:: python - :okwarning: - - @savefig dask_resample.png - ddf[["x", "y"]].resample("1D").mean().cumsum().compute().plot() - -.. 
ipython:: python - :suppress: - - import shutil - - shutil.rmtree("data/timeseries") - -These Dask examples have all been done using multiple processes on a single -machine. Dask can be `deployed on a cluster -`_ to scale up to even larger -datasets. - -You see more dask examples at https://examples.dask.org. - -.. _Dask: https://dask.org -.. _dask.dataframe: https://docs.dask.org/en/latest/dataframe.html +There are other libraries which provide similar APIs to pandas and work nicely with pandas DataFrame, +and can give you the ability to scale your large dataset processing and analytics +by parallel runtime, distributed memory, clustering, etc. You can find more information +in `the ecosystem page <https://pandas.pydata.org/community/ecosystem.html>`_. diff --git a/environment.yml b/environment.yml index 7db2767661d62..58eb69ad1f070 100644 --- a/environment.yml +++ b/environment.yml @@ -60,9 +60,8 @@ dependencies: - zstandard>=0.19.0 # downstream packages - - dask-core<=2024.2.1 + - dask-core - seaborn-base - - dask-expr<=0.5.3 # local testing dependencies - moto diff --git a/requirements-dev.txt b/requirements-dev.txt index 68e386edb0c31..5a63e59e1db88 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -47,9 +47,8 @@ xarray>=2022.12.0 xlrd>=2.0.1 xlsxwriter>=3.0.5 zstandard>=0.19.0 -dask<=2024.2.1 +dask seaborn -dask-expr<=0.5.3 moto flask asv>=0.6.1 From cd6eeae3d3501ee316bbcd135ed7cbe969e2c856 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 18 Mar 2024 18:43:02 +0100 Subject: [PATCH 136/396] Backport PR #57883 on branch 2.2.x (Bump pypa/cibuildwheel from 2.16.5 to 2.17.0) (#57888) Backport PR #57883: Bump pypa/cibuildwheel from 2.16.5 to 2.17.0 Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index f79b2c51b5f92..470c044d2e99e 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -141,7 +141,7 @@ jobs: - name: Build normal wheels if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }} - uses: pypa/cibuildwheel@v2.16.5 + uses: pypa/cibuildwheel@v2.17.0 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: @@ -150,7 +150,7 @@ jobs: - name: Build nightly wheels (with NumPy pre-release) if: ${{ (env.IS_SCHEDULE_DISPATCH == 'true' && env.IS_PUSH != 'true') }} - uses: pypa/cibuildwheel@v2.16.5 + uses: pypa/cibuildwheel@v2.17.0 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From 71a6797dfa55b8d937abc226462b322e32feb4da Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 18 Mar 2024 21:36:47 +0100 Subject: [PATCH 137/396] Backport PR #57892 on branch 2.2.x (CI: xfail Pyarrow slicing test) (#57898) Backport PR #57892: CI: xfail Pyarrow slicing test Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/compat/__init__.py | 2 ++ pandas/compat/pyarrow.py | 2 ++ pandas/tests/indexes/object/test_indexing.py | 12 +++++++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 738442fab8c70..eb890c8b8c0ab 100644 ---
a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -30,6 +30,7 @@ pa_version_under13p0, pa_version_under14p0, pa_version_under14p1, + pa_version_under16p0, ) if TYPE_CHECKING: @@ -186,6 +187,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under13p0", "pa_version_under14p0", "pa_version_under14p1", + "pa_version_under16p0", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index beb4814914101..a2dfa69bbf236 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -15,6 +15,7 @@ pa_version_under14p0 = _palv < Version("14.0.0") pa_version_under14p1 = _palv < Version("14.0.1") pa_version_under15p0 = _palv < Version("15.0.0") + pa_version_under16p0 = _palv < Version("16.0.0") except ImportError: pa_version_under10p1 = True pa_version_under11p0 = True @@ -23,3 +24,4 @@ pa_version_under14p0 = True pa_version_under14p1 = True pa_version_under15p0 = True + pa_version_under16p0 = True diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index ebf9dac715f8d..443cacf94d239 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -7,6 +7,7 @@ NA, is_matching_na, ) +from pandas.compat import pa_version_under16p0 import pandas.util._test_decorators as td import pandas as pd @@ -200,7 +201,16 @@ class TestSliceLocs: (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected, dtype): + def test_slice_locs_negative_step(self, in_slice, expected, dtype, request): + if ( + not pa_version_under16p0 + and dtype == "string[pyarrow_numpy]" + and in_slice == slice("a", "a", -1) + ): + request.applymarker( + pytest.mark.xfail(reason="https://github.com/apache/arrow/issues/40642") + ) + index = Index(list("bcdxy"), dtype=dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) From cc5632159fb9f24586be29b80223e910c566a6e1 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 18 Mar 2024 23:01:56 +0100 Subject: [PATCH 138/396] Backport PR #57889 on branch 2.2.x (BUG: Handle Series construction with Dask, dict-like, Series) (#57899) Backport PR #57889: BUG: Handle Series construction with Dask, dict-like, Series Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 236085c2a62e1..c1782206d4b67 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -533,7 +533,7 @@ def __init__( data = data.reindex(index, copy=copy) copy = False data = data._mgr - elif is_dict_like(data): + elif isinstance(data, Mapping): data, index = self._init_dict(data, index, dtype) dtype = None copy = False @@ -605,7 +605,7 @@ def __init__( ) def _init_dict( - self, data, index: Index | None = None, dtype: DtypeObj | None = None + self, data: Mapping, index: Index | None = None, dtype: DtypeObj | None = None ): """ Derive the "_mgr" and "index" attributes of a new Series from a From 83497f53dee16f447f2df29af60ff5521c4f1177 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 19 Mar 2024 01:55:34 +0100 Subject: [PATCH 139/396] Backport PR #57905 on branch 2.2.x (Revert "Fix issue with Tempita recompilation (#57796)") (#57907) 
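The Series-construction fix above (PR #57889) replaces pandas' duck-typed ``is_dict_like`` check with a strict ``isinstance(data, Mapping)`` check, so objects that merely look dict-like -- a Dask Series exposes ``__getitem__``, ``keys`` and ``__contains__`` -- are no longer routed into ``_init_dict``. A minimal sketch of the distinction; ``DuckSeries`` is an illustrative stand-in, not part of the patch::

    from collections.abc import Mapping

    from pandas.api.types import is_dict_like

    class DuckSeries:
        # Stand-in for a Dask Series: indexable and key-aware, yet not a Mapping.
        def __getitem__(self, key):
            return 1

        def keys(self):
            return ["a"]

        def __contains__(self, key):
            return True

    obj = DuckSeries()
    print(is_dict_like(obj))         # True  -> the old check took the dict path
    print(isinstance(obj, Mapping))  # False -> the new check defers to array handling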
Backport PR #57905: Revert "Fix issue with Tempita recompilation (#57796)" Co-authored-by: William Ayd --- pandas/_libs/meson.build | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index 7621915ebcfdb..c27386743c6e9 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -54,37 +54,25 @@ _intervaltree_helper = custom_target('intervaltree_helper_pxi', py, tempita, '@INPUT@', '-o', '@OUTDIR@' ] ) - -_algos_pxi_dep = declare_dependency(sources: [_algos_take_helper, _algos_common_helper]) -_khash_pxi_dep = declare_dependency(sources: _khash_primitive_helper) -_hashtable_pxi_dep = declare_dependency( - sources: [_hashtable_class_helper, _hashtable_func_helper] -) -_index_pxi_dep = declare_dependency(sources: _index_class_helper) -_intervaltree_pxi_dep = declare_dependency(sources: _intervaltree_helper) -_sparse_pxi_dep = declare_dependency(sources: _sparse_op_helper) - +_khash_primitive_helper_dep = declare_dependency(sources: _khash_primitive_helper) subdir('tslibs') libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} # numpy include dir is implicitly included - 'algos': {'sources': ['algos.pyx'], - 'deps': [_khash_pxi_dep, _algos_pxi_dep]}, + 'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], 'deps': _khash_primitive_helper_dep}, 'arrays': {'sources': ['arrays.pyx']}, 'groupby': {'sources': ['groupby.pyx']}, 'hashing': {'sources': ['hashing.pyx']}, - 'hashtable': {'sources': ['hashtable.pyx'], - 'deps': [_khash_pxi_dep, _hashtable_pxi_dep]}, - 'index': {'sources': ['index.pyx'], - 'deps': [_khash_pxi_dep, _index_pxi_dep]}, + 'hashtable': {'sources': ['hashtable.pyx', _hashtable_class_helper, _hashtable_func_helper], 'deps': _khash_primitive_helper_dep}, + 'index': {'sources': ['index.pyx', _index_class_helper], 'deps': _khash_primitive_helper_dep}, 'indexing': {'sources': ['indexing.pyx']}, 'internals': {'sources': ['internals.pyx']}, - 'interval': {'sources': ['interval.pyx'], - 'deps': [_khash_pxi_dep, _intervaltree_pxi_dep]}, - 'join': {'sources': ['join.pyx'], - 'deps': [_khash_pxi_dep]}, + 'interval': {'sources': ['interval.pyx', _intervaltree_helper], + 'deps': _khash_primitive_helper_dep}, + 'join': {'sources': ['join.pyx', _khash_primitive_helper], + 'deps': _khash_primitive_helper_dep}, 'lib': {'sources': ['lib.pyx', 'src/parser/tokenizer.c']}, 'missing': {'sources': ['missing.pyx']}, 'pandas_datetime': {'sources': ['src/vendored/numpy/datetime/np_datetime.c', @@ -95,7 +83,7 @@ libs_sources = { 'src/parser/io.c', 'src/parser/pd_parser.c']}, 'parsers': {'sources': ['parsers.pyx', 'src/parser/tokenizer.c', 'src/parser/io.c'], - 'deps': [_khash_pxi_dep]}, + 'deps': _khash_primitive_helper_dep}, 'json': {'sources': ['src/vendored/ujson/python/ujson.c', 'src/vendored/ujson/python/objToJSON.c', 'src/vendored/ujson/python/JSONtoObj.c', @@ -107,8 +95,7 @@ libs_sources = { 'reshape': {'sources': ['reshape.pyx']}, 'sas': {'sources': ['sas.pyx']}, 'byteswap': {'sources': ['byteswap.pyx']}, - 'sparse': {'sources': ['sparse.pyx'], - 'deps': [_sparse_pxi_dep]}, + 'sparse': {'sources': ['sparse.pyx', _sparse_op_helper]}, 'tslib': {'sources': ['tslib.pyx']}, 'testing': {'sources': ['testing.pyx']}, 'writers': {'sources': ['writers.pyx']} From 2a6d800df89a4878fbf432ae71f93e3803d67672 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 19 Mar 2024 17:29:17 
+0100 Subject: [PATCH 140/396] Backport PR #57886 on branch 2.2.x (CI: Remove ASAN job) (#57910) Backport PR #57886: CI: Remove ASAN job Co-authored-by: William Ayd --- .github/actions/run-tests/action.yml | 9 +------- .github/workflows/unit-tests.yml | 14 ------------ ci/deps/actions-311-sanitizers.yaml | 32 ---------------------------- 3 files changed, 1 insertion(+), 54 deletions(-) delete mode 100644 ci/deps/actions-311-sanitizers.yaml diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml index b4778b74df335..fd7c3587f2254 100644 --- a/.github/actions/run-tests/action.yml +++ b/.github/actions/run-tests/action.yml @@ -1,16 +1,9 @@ name: Run tests and report results -inputs: - preload: - description: Preload arguments for sanitizer - required: false - asan_options: - description: Arguments for Address Sanitizer (ASAN) - required: false runs: using: composite steps: - name: Test - run: ${{ inputs.asan_options }} ${{ inputs.preload }} ci/run_tests.sh + run: ci/run_tests.sh shell: bash -el {0} - name: Publish test results diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 8736674bbf965..bacc3d874a60d 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -96,14 +96,6 @@ jobs: - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" - - name: "ASAN / UBSAN" - env_file: actions-311-sanitizers.yaml - pattern: "not slow and not network and not single_cpu and not skip_ubsan" - asan_options: "ASAN_OPTIONS=detect_leaks=0" - preload: LD_PRELOAD=$(gcc -print-file-name=libasan.so) - meson_args: --config-settings=setup-args="-Db_sanitize=address,undefined" - cflags_adds: -fno-sanitize-recover=all - pytest_workers: -1 # disable pytest-xdist as it swallows stderr from ASAN fail-fast: false name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} env: @@ -190,18 +182,12 @@ jobs: - name: Test (not single_cpu) uses: ./.github/actions/run-tests if: ${{ matrix.name != 'Pypy' }} - with: - preload: ${{ matrix.preload }} - asan_options: ${{ matrix.asan_options }} env: # Set pattern to not single_cpu if not already set PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || matrix.pattern }} - name: Test (single_cpu) uses: ./.github/actions/run-tests - with: - preload: ${{ matrix.preload }} - asan_options: ${{ matrix.asan_options }} env: PATTERN: 'single_cpu' PYTEST_WORKERS: 0 diff --git a/ci/deps/actions-311-sanitizers.yaml b/ci/deps/actions-311-sanitizers.yaml deleted file mode 100644 index f5f04c90bffad..0000000000000 --- a/ci/deps/actions-311-sanitizers.yaml +++ /dev/null @@ -1,32 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.11 - - # build dependencies - - versioneer[toml] - - cython>=0.29.33 - - meson[ninja]=1.2.1 - - meson-python=0.13.1 - - # test dependencies - - pytest>=7.3.2 - - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 - - boto3 - - hypothesis>=6.46.1 - - pyqt>=5.15.9 - - # required dependencies - - python-dateutil - - numpy - - pytz - - # pandas dependencies - - pip - - - pip: - - "tzdata>=2022.7" From 78f7a02d1f36cdcdf5c3420afbf4965a85db7f01 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 21 Mar 2024 04:05:31 +0100 Subject: [PATCH 141/396] Backport PR #57029 on branch 2.2.x (DOC: Add `DataFrame.to_numpy` method) (#57940) Backport PR #57029: DOC: Add `DataFrame.to_numpy` 
method Co-authored-by: Zhengbo Wang <77875500+luke396@users.noreply.github.com> --- doc/source/reference/frame.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index fefb02dd916cd..1d9019ff22c23 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -49,6 +49,7 @@ Conversion DataFrame.infer_objects DataFrame.copy DataFrame.bool + DataFrame.to_numpy Indexing, iteration ~~~~~~~~~~~~~~~~~~~ From 7e8d492b26bb933884492ffcb5ce8fc501c281d5 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 21 Mar 2024 16:37:58 +0000 Subject: [PATCH 142/396] Backport PR #57764 on branch 2.2.x (BUG: PyArrow dtypes were not supported in the interchange protocol) (#57947) --- doc/source/whatsnew/v2.2.2.rst | 4 +- pandas/core/interchange/buffer.py | 58 +++++++ pandas/core/interchange/column.py | 66 ++++++-- pandas/core/interchange/dataframe.py | 5 + pandas/core/interchange/from_dataframe.py | 17 +- pandas/core/interchange/utils.py | 28 ++++ pandas/tests/interchange/test_impl.py | 186 +++++++++++++++++++--- 7 files changed, 326 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 96f210ce6b7b9..54084abab7817 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -14,6 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the a column's type was a pandas nullable on with missing values (:issue:`56702`) +- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the a column's type was a pyarrow nullable on with missing values (:issue:`57664`) - .. --------------------------------------------------------------------------- @@ -21,7 +22,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- :meth:`DataFrame.__dataframe__` was showing bytemask instead of bitmask for ``'string[pyarrow]'`` validity buffer (:issue:`57762`) +- :meth:`DataFrame.__dataframe__` was showing non-null validity buffer (instead of ``None``) ``'string[pyarrow]'`` without missing values (:issue:`57761`) .. --------------------------------------------------------------------------- .. _whatsnew_222.other: diff --git a/pandas/core/interchange/buffer.py b/pandas/core/interchange/buffer.py index 5c97fc17d7070..5d24325e67f62 100644 --- a/pandas/core/interchange/buffer.py +++ b/pandas/core/interchange/buffer.py @@ -12,6 +12,7 @@ if TYPE_CHECKING: import numpy as np + import pyarrow as pa class PandasBuffer(Buffer): @@ -76,3 +77,60 @@ def __repr__(self) -> str: ) + ")" ) + + +class PandasBufferPyarrow(Buffer): + """ + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__( + self, + buffer: pa.Buffer, + *, + length: int, + ) -> None: + """ + Handle pyarrow chunked arrays. + """ + self._buffer = buffer + self._length = length + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + return self._buffer.size + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + return self._buffer.address + + def __dlpack__(self) -> Any: + """ + Represent this structure as DLPack interface. + """ + raise NotImplementedError() + + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. 
+ """ + return (DlpackDeviceType.CPU, None) + + def __repr__(self) -> str: + return ( + "PandasBuffer[pyarrow](" + + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "device": "CPU", + } + ) + + ")" + ) diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index 7b39403ca1916..d59a3df694bb3 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -1,6 +1,9 @@ from __future__ import annotations -from typing import Any +from typing import ( + TYPE_CHECKING, + Any, +) import numpy as np @@ -9,15 +12,18 @@ from pandas.errors import NoBufferPresent from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.dtypes import ( +from pandas.core.dtypes.dtypes import BaseMaskedDtype + +import pandas as pd +from pandas import ( ArrowDtype, - BaseMaskedDtype, DatetimeTZDtype, ) - -import pandas as pd from pandas.api.types import is_string_dtype -from pandas.core.interchange.buffer import PandasBuffer +from pandas.core.interchange.buffer import ( + PandasBuffer, + PandasBufferPyarrow, +) from pandas.core.interchange.dataframe_protocol import ( Column, ColumnBuffers, @@ -30,6 +36,9 @@ dtype_to_arrow_c_fmt, ) +if TYPE_CHECKING: + from pandas.core.interchange.dataframe_protocol import Buffer + _NP_KINDS = { "i": DtypeKind.INT, "u": DtypeKind.UINT, @@ -157,6 +166,16 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: else: byteorder = dtype.byteorder + if dtype == "bool[pyarrow]": + # return early to avoid the `* 8` below, as this is a bitmask + # rather than a bytemask + return ( + kind, + dtype.itemsize, # pyright: ignore[reportGeneralTypeIssues] + ArrowCTypes.BOOL, + byteorder, + ) + return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), byteorder @property @@ -194,6 +213,12 @@ def describe_null(self): column_null_dtype = ColumnNullType.USE_BYTEMASK null_value = 1 return column_null_dtype, null_value + if isinstance(self._col.dtype, ArrowDtype): + # We already rechunk (if necessary / allowed) upon initialization, so this + # is already single-chunk by the time we get here. + if self._col.array._pa_array.chunks[0].buffers()[0] is None: # type: ignore[attr-defined] + return ColumnNullType.NON_NULLABLE, None + return ColumnNullType.USE_BITMASK, 0 kind = self.dtype[0] try: null, value = _NULL_DESCRIPTION[kind] @@ -278,10 +303,11 @@ def get_buffers(self) -> ColumnBuffers: def _get_data_buffer( self, - ) -> tuple[PandasBuffer, Any]: # Any is for self.dtype tuple + ) -> tuple[Buffer, tuple[DtypeKind, int, str, str]]: """ Return the buffer containing the data and the buffer's associated dtype. """ + buffer: Buffer if self.dtype[0] in ( DtypeKind.INT, DtypeKind.UINT, @@ -291,6 +317,7 @@ def _get_data_buffer( ): # self.dtype[2] is an ArrowCTypes.TIMESTAMP where the tz will make # it longer than 4 characters + dtype = self.dtype if self.dtype[0] == DtypeKind.DATETIME and len(self.dtype[2]) > 4: np_arr = self._col.dt.tz_convert(None).to_numpy() else: @@ -298,11 +325,17 @@ def _get_data_buffer( if isinstance(self._col.dtype, BaseMaskedDtype): np_arr = arr._data # type: ignore[attr-defined] elif isinstance(self._col.dtype, ArrowDtype): - raise NotImplementedError("ArrowDtype not handled yet") + # We already rechunk (if necessary / allowed) upon initialization, + # so this is already single-chunk by the time we get here. 
+ arr = arr._pa_array.chunks[0] # type: ignore[attr-defined] + buffer = PandasBufferPyarrow( + arr.buffers()[1], # type: ignore[attr-defined] + length=len(arr), + ) + return buffer, dtype else: np_arr = arr._ndarray # type: ignore[attr-defined] buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy) - dtype = self.dtype elif self.dtype[0] == DtypeKind.CATEGORICAL: codes = self._col.values._codes buffer = PandasBuffer(codes, allow_copy=self._allow_copy) @@ -330,13 +363,26 @@ def _get_data_buffer( return buffer, dtype - def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]: + def _get_validity_buffer(self) -> tuple[Buffer, Any] | None: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. Raises NoBufferPresent if null representation is not a bit or byte mask. """ null, invalid = self.describe_null + buffer: Buffer + if isinstance(self._col.dtype, ArrowDtype): + # We already rechunk (if necessary / allowed) upon initialization, so this + # is already single-chunk by the time we get here. + arr = self._col.array._pa_array.chunks[0] # type: ignore[attr-defined] + dtype = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE) + if arr.buffers()[0] is None: + return None + buffer = PandasBufferPyarrow( + arr.buffers()[0], + length=len(arr), + ) + return buffer, dtype if isinstance(self._col.dtype, BaseMaskedDtype): mask = self._col.array._mask # type: ignore[attr-defined] diff --git a/pandas/core/interchange/dataframe.py b/pandas/core/interchange/dataframe.py index 1ffe0e8e8dbb0..1abacddfc7e3b 100644 --- a/pandas/core/interchange/dataframe.py +++ b/pandas/core/interchange/dataframe.py @@ -5,6 +5,7 @@ from pandas.core.interchange.column import PandasColumn from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg +from pandas.core.interchange.utils import maybe_rechunk if TYPE_CHECKING: from collections.abc import ( @@ -34,6 +35,10 @@ def __init__(self, df: DataFrame, allow_copy: bool = True) -> None: """ self._df = df.rename(columns=str, copy=False) self._allow_copy = allow_copy + for i, _col in enumerate(self._df.columns): + rechunked = maybe_rechunk(self._df.iloc[:, i], allow_copy=allow_copy) + if rechunked is not None: + self._df.isetitem(i, rechunked) def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index d45ae37890ba7..4162ebc33f0d6 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -295,13 +295,14 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: null_pos = None if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): - assert buffers["validity"], "Validity buffers cannot be empty for masks" - valid_buff, valid_dtype = buffers["validity"] - null_pos = buffer_to_ndarray( - valid_buff, valid_dtype, offset=col.offset, length=col.size() - ) - if sentinel_val == 0: - null_pos = ~null_pos + validity = buffers["validity"] + if validity is not None: + valid_buff, valid_dtype = validity + null_pos = buffer_to_ndarray( + valid_buff, valid_dtype, offset=col.offset, length=col.size() + ) + if sentinel_val == 0: + null_pos = ~null_pos # Assemble the strings from the code units str_list: list[None | float | str] = [None] * col.size() @@ -486,6 +487,8 @@ def set_nulls( np.ndarray or pd.Series Data with the nulls being set. 
""" + if validity is None: + return data null_kind, sentinel_val = col.describe_null null_pos = None diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py index 2e73e560e5740..2a19dd5046aa3 100644 --- a/pandas/core/interchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -16,6 +16,8 @@ DatetimeTZDtype, ) +import pandas as pd + if typing.TYPE_CHECKING: from pandas._typing import DtypeObj @@ -145,3 +147,29 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: raise NotImplementedError( f"Conversion of {dtype} to Arrow C format string is not implemented." ) + + +def maybe_rechunk(series: pd.Series, *, allow_copy: bool) -> pd.Series | None: + """ + Rechunk a multi-chunk pyarrow array into a single-chunk array, if necessary. + + - Returns `None` if the input series is not backed by a multi-chunk pyarrow array + (and so doesn't need rechunking) + - Returns a single-chunk-backed-Series if the input is backed by a multi-chunk + pyarrow array and `allow_copy` is `True`. + - Raises a `RuntimeError` if `allow_copy` is `False` and input is a + based by a multi-chunk pyarrow array. + """ + if not isinstance(series.dtype, pd.ArrowDtype): + return None + chunked_array = series.array._pa_array # type: ignore[attr-defined] + if len(chunked_array.chunks) == 1: + return None + if not allow_copy: + raise RuntimeError( + "Found multi-chunk pyarrow array, but `allow_copy` is False. " + "Please rechunk the array before calling this function, or set " + "`allow_copy=True`." + ) + arr = chunked_array.combine_chunks() + return pd.Series(arr, dtype=series.dtype, name=series.name, index=series.index) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index a1dedb6be456c..1ccada9116d4c 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -1,4 +1,7 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import numpy as np import pytest @@ -301,6 +304,51 @@ def test_multi_chunk_pyarrow() -> None: pd.api.interchange.from_dataframe(table, allow_copy=False) +def test_multi_chunk_column() -> None: + pytest.importorskip("pyarrow", "11.0.0") + ser = pd.Series([1, 2, None], dtype="Int64[pyarrow]") + df = pd.concat([ser, ser], ignore_index=True).to_frame("a") + df_orig = df.copy() + with pytest.raises( + RuntimeError, match="Found multi-chunk pyarrow array, but `allow_copy` is False" + ): + pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=False)) + result = pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=True)) + # Interchange protocol defaults to creating numpy-backed columns, so currently this + # is 'float64'. + expected = pd.DataFrame({"a": [1.0, 2.0, None, 1.0, 2.0, None]}, dtype="float64") + tm.assert_frame_equal(result, expected) + + # Check that the rechunking we did didn't modify the original DataFrame. 
+ tm.assert_frame_equal(df, df_orig) + assert len(df["a"].array._pa_array.chunks) == 2 + assert len(df_orig["a"].array._pa_array.chunks) == 2 + + +def test_timestamp_ns_pyarrow(): + # GH 56712 + pytest.importorskip("pyarrow", "11.0.0") + timestamp_args = { + "year": 2000, + "month": 1, + "day": 1, + "hour": 1, + "minute": 1, + "second": 1, + } + df = pd.Series( + [datetime(**timestamp_args)], + dtype="timestamp[ns][pyarrow]", + name="col0", + ).to_frame() + + dfi = df.__dataframe__() + result = pd.api.interchange.from_dataframe(dfi)["col0"].item() + + expected = pd.Timestamp(**timestamp_args) + assert result == expected + + @pytest.mark.parametrize("tz", ["UTC", "US/Pacific"]) @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_datetimetzdtype(tz, unit): @@ -403,42 +451,60 @@ def test_non_str_names_w_duplicates(): pd.api.interchange.from_dataframe(dfi, allow_copy=False) -def test_nullable_integers() -> None: - # https://github.com/pandas-dev/pandas/issues/55069 - df = pd.DataFrame({"a": [1]}, dtype="Int8") - expected = pd.DataFrame({"a": [1]}, dtype="int8") - result = pd.api.interchange.from_dataframe(df.__dataframe__()) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/57664") -def test_nullable_integers_pyarrow() -> None: - # https://github.com/pandas-dev/pandas/issues/55069 - df = pd.DataFrame({"a": [1]}, dtype="Int8[pyarrow]") - expected = pd.DataFrame({"a": [1]}, dtype="int8") - result = pd.api.interchange.from_dataframe(df.__dataframe__()) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( ("data", "dtype", "expected_dtype"), [ ([1, 2, None], "Int64", "int64"), + ([1, 2, None], "Int64[pyarrow]", "int64"), + ([1, 2, None], "Int8", "int8"), + ([1, 2, None], "Int8[pyarrow]", "int8"), ( [1, 2, None], "UInt64", "uint64", ), + ( + [1, 2, None], + "UInt64[pyarrow]", + "uint64", + ), ([1.0, 2.25, None], "Float32", "float32"), + ([1.0, 2.25, None], "Float32[pyarrow]", "float32"), + ([True, False, None], "boolean[pyarrow]", "bool"), + (["much ado", "about", None], "string[pyarrow_numpy]", "large_string"), + (["much ado", "about", None], "string[pyarrow]", "large_string"), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), None], + "timestamp[ns][pyarrow]", + "timestamp[ns]", + ), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), None], + "timestamp[us][pyarrow]", + "timestamp[us]", + ), + ( + [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + None, + ], + "timestamp[us, Asia/Kathmandu][pyarrow]", + "timestamp[us, tz=Asia/Kathmandu]", + ), ], ) -def test_pandas_nullable_w_missing_values( +def test_pandas_nullable_with_missing_values( data: list, dtype: str, expected_dtype: str ) -> None: # https://github.com/pandas-dev/pandas/issues/57643 - pytest.importorskip("pyarrow", "11.0.0") + # https://github.com/pandas-dev/pandas/issues/57664 + pa = pytest.importorskip("pyarrow", "11.0.0") import pyarrow.interchange as pai + if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": + expected_dtype = pa.timestamp("us", "Asia/Kathmandu") + df = pd.DataFrame({"a": data}, dtype=dtype) result = pai.from_dataframe(df.__dataframe__())["a"] assert result.type == expected_dtype @@ -447,6 +513,86 @@ def test_pandas_nullable_w_missing_values( assert result[2].as_py() is None +@pytest.mark.parametrize( + ("data", "dtype", "expected_dtype"), + [ + ([1, 2, 3], 
"Int64", "int64"), + ([1, 2, 3], "Int64[pyarrow]", "int64"), + ([1, 2, 3], "Int8", "int8"), + ([1, 2, 3], "Int8[pyarrow]", "int8"), + ( + [1, 2, 3], + "UInt64", + "uint64", + ), + ( + [1, 2, 3], + "UInt64[pyarrow]", + "uint64", + ), + ([1.0, 2.25, 5.0], "Float32", "float32"), + ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"), + ([True, False, False], "boolean[pyarrow]", "bool"), + (["much ado", "about", "nothing"], "string[pyarrow_numpy]", "large_string"), + (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], + "timestamp[ns][pyarrow]", + "timestamp[ns]", + ), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], + "timestamp[us][pyarrow]", + "timestamp[us]", + ), + ( + [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + datetime(2020, 1, 3, tzinfo=timezone.utc), + ], + "timestamp[us, Asia/Kathmandu][pyarrow]", + "timestamp[us, tz=Asia/Kathmandu]", + ), + ], +) +def test_pandas_nullable_without_missing_values( + data: list, dtype: str, expected_dtype: str +) -> None: + # https://github.com/pandas-dev/pandas/issues/57643 + pa = pytest.importorskip("pyarrow", "11.0.0") + import pyarrow.interchange as pai + + if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": + expected_dtype = pa.timestamp("us", "Asia/Kathmandu") + + df = pd.DataFrame({"a": data}, dtype=dtype) + result = pai.from_dataframe(df.__dataframe__())["a"] + assert result.type == expected_dtype + assert result[0].as_py() == data[0] + assert result[1].as_py() == data[1] + assert result[2].as_py() == data[2] + + +def test_string_validity_buffer() -> None: + # https://github.com/pandas-dev/pandas/issues/57761 + pytest.importorskip("pyarrow", "11.0.0") + df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") + result = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"] + assert result is None + + +def test_string_validity_buffer_no_missing() -> None: + # https://github.com/pandas-dev/pandas/issues/57762 + pytest.importorskip("pyarrow", "11.0.0") + df = pd.DataFrame({"a": ["x", None]}, dtype="large_string[pyarrow]") + validity = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"] + assert validity is not None + result = validity[1] + expected = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, "=") + assert result == expected + + def test_empty_dataframe(): # https://github.com/pandas-dev/pandas/issues/56700 df = pd.DataFrame({"a": []}, dtype="int8") From 40e621fc9f77d01e80516a8ca99294386a438958 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 27 Mar 2024 18:51:23 +0100 Subject: [PATCH 143/396] Backport PR #57548 on branch 2.2.x (Fix accidental loss-of-precision for to_datetime(str, unit=...)) (#58034) Backport PR #57548: Fix accidental loss-of-precision for to_datetime(str, unit=...) 
Co-authored-by: Elliott Sales de Andrade --- doc/source/whatsnew/v2.2.2.rst | 2 +- pandas/_libs/tslib.pyx | 2 +- pandas/tests/tools/test_to_datetime.py | 8 ++++++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 54084abab7817..19539918b8c8f 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -15,7 +15,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the a column's type was a pandas nullable on with missing values (:issue:`56702`) - :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the a column's type was a pyarrow nullable on with missing values (:issue:`57664`) -- +- Fixed regression in precision of :func:`to_datetime` with string and ``unit`` input (:issue:`57051`) .. --------------------------------------------------------------------------- .. _whatsnew_222.bug_fixes: diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 017fdc4bc834f..dd23c2f27ca09 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -277,7 +277,7 @@ def array_with_unit_to_datetime( bint is_raise = errors == "raise" ndarray[int64_t] iresult tzinfo tz = None - float fval + double fval assert is_ignore or is_coerce or is_raise diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 6791ac0340640..a1ed996dade8e 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1912,6 +1912,14 @@ def test_unit(self, cache): with pytest.raises(ValueError, match=msg): to_datetime([1], unit="D", format="%Y%m%d", cache=cache) + def test_unit_str(self, cache): + # GH 57051 + # Test that strs aren't dropping precision to 32-bit accidentally. 
+ with tm.assert_produces_warning(FutureWarning): + res = to_datetime(["1704660000"], unit="s", origin="unix") + expected = to_datetime([1704660000], unit="s", origin="unix") + tm.assert_index_equal(res, expected) + def test_unit_array_mixed_nans(self, cache): values = [11111111111111111, 1, 1.0, iNaT, NaT, np.nan, "NaT", ""] result = to_datetime(values, unit="D", errors="ignore", cache=cache) From e1a7302c8c0b5d33c5bc57aa81ee371782ad9289 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 27 Mar 2024 20:02:51 +0100 Subject: [PATCH 144/396] Backport PR #57758 on branch 2.2.x (BUG: DataFrame Interchange Protocol errors on Boolean columns) (#58036) Backport PR #57758: BUG: DataFrame Interchange Protocol errors on Boolean columns Co-authored-by: Marco Edward Gorelli --- doc/source/whatsnew/v2.2.2.rst | 1 + pandas/core/interchange/utils.py | 3 +++ pandas/tests/interchange/test_impl.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 19539918b8c8f..d0f8951ac07ad 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -22,6 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the column's type was nullable boolean (:issue:`55332`) - :meth:`DataFrame.__dataframe__` was showing bytemask instead of bitmask for ``'string[pyarrow]'`` validity buffer (:issue:`57762`) - :meth:`DataFrame.__dataframe__` was showing non-null validity buffer (instead of ``None``) ``'string[pyarrow]'`` without missing values (:issue:`57761`) diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py index 2a19dd5046aa3..fd1c7c9639242 100644 --- a/pandas/core/interchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -144,6 +144,9 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: elif isinstance(dtype, DatetimeTZDtype): return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz) + elif isinstance(dtype, pd.BooleanDtype): + return ArrowCTypes.BOOL + raise NotImplementedError( f"Conversion of {dtype} to Arrow C format string is not implemented." 
) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 1ccada9116d4c..25418b8bb2b37 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -470,6 +470,7 @@ def test_non_str_names_w_duplicates(): ), ([1.0, 2.25, None], "Float32", "float32"), ([1.0, 2.25, None], "Float32[pyarrow]", "float32"), + ([True, False, None], "boolean", "bool"), ([True, False, None], "boolean[pyarrow]", "bool"), (["much ado", "about", None], "string[pyarrow_numpy]", "large_string"), (["much ado", "about", None], "string[pyarrow]", "large_string"), @@ -532,6 +533,7 @@ def test_pandas_nullable_with_missing_values( ), ([1.0, 2.25, 5.0], "Float32", "float32"), ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"), + ([True, False, False], "boolean", "bool"), ([True, False, False], "boolean[pyarrow]", "bool"), (["much ado", "about", "nothing"], "string[pyarrow_numpy]", "large_string"), (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"), From f4554014bd15149e28fc5b77a3e30a88987ff8a3 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 28 Mar 2024 23:26:08 +0100 Subject: [PATCH 145/396] Backport PR #57974 on branch 2.2.x (BUG: Fixed ADBC to_sql creation of table when using public schema) (#58050) Backport PR #57974: BUG: Fixed ADBC to_sql creation of table when using public schema Co-authored-by: Shabab Karim --- doc/source/whatsnew/v2.2.2.rst | 1 + pandas/io/sql.py | 4 +++- pandas/tests/io/test_sql.py | 24 ++++++++++++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index d0f8951ac07ad..9e1a883d47cf8 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -25,6 +25,7 @@ Bug fixes - :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the column's type was nullable boolean (:issue:`55332`) - :meth:`DataFrame.__dataframe__` was showing bytemask instead of bitmask for ``'string[pyarrow]'`` validity buffer (:issue:`57762`) - :meth:`DataFrame.__dataframe__` was showing non-null validity buffer (instead of ``None``) ``'string[pyarrow]'`` without missing values (:issue:`57761`) +- :meth:`DataFrame.to_sql` was failing to find the right table when using the schema argument (:issue:`57539`) .. --------------------------------------------------------------------------- .. 
_whatsnew_222.other: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 195a7c5040853..3e17175167f25 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2400,7 +2400,9 @@ def to_sql( raise ValueError("datatypes not supported") from exc with self.con.cursor() as cur: - total_inserted = cur.adbc_ingest(table_name, tbl, mode=mode) + total_inserted = cur.adbc_ingest( + table_name=name, data=tbl, mode=mode, db_schema_name=schema + ) self.con.commit() return total_inserted diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 791b6da3deeca..4f1f965f26aa9 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1373,6 +1373,30 @@ def insert_on_conflict(table, conn, keys, data_iter): pandasSQL.drop_table("test_insert_conflict") +@pytest.mark.parametrize("conn", all_connectable) +def test_to_sql_on_public_schema(conn, request): + if "sqlite" in conn or "mysql" in conn: + request.applymarker( + pytest.mark.xfail( + reason="test for public schema only specific to postgresql" + ) + ) + + conn = request.getfixturevalue(conn) + + test_data = DataFrame([[1, 2.1, "a"], [2, 3.1, "b"]], columns=list("abc")) + test_data.to_sql( + name="test_public_schema", + con=conn, + if_exists="append", + index=False, + schema="public", + ) + + df_out = sql.read_sql_table("test_public_schema", conn, schema="public") + tm.assert_frame_equal(test_data, df_out) + + @pytest.mark.parametrize("conn", mysql_connectable) def test_insertion_method_on_conflict_update(conn, request): # GH 14553: Example in to_sql docstring From 810b2d032f1ba09da244ef913a49283c20d5c510 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 1 Apr 2024 11:22:03 -0700 Subject: [PATCH 146/396] Backport PR #57553 on branch 2.2.x (API: avoid passing Manager to subclass init) (#58008) * Backport PR #57553: API: avoid passing Manager to subclass __init__ * whatsnew, type ignores * merge 2.2.2 file from main * rebase on 2.2.x whatsnew --- pandas/core/frame.py | 45 +++++++++++++++++++---------- pandas/core/generic.py | 1 + pandas/core/resample.py | 3 +- pandas/core/series.py | 34 ++++++++++++---------- pandas/tests/frame/test_subclass.py | 11 +++++++ 5 files changed, 62 insertions(+), 32 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5c510d98596df..afcd4d014316e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -656,26 +656,37 @@ class DataFrame(NDFrame, OpsMixin): def _constructor(self) -> Callable[..., DataFrame]: return DataFrame - def _constructor_from_mgr(self, mgr, axes): - if self._constructor is DataFrame: - # we are pandas.DataFrame (or a subclass that doesn't override _constructor) - return DataFrame._from_mgr(mgr, axes=axes) - else: - assert axes is mgr.axes + def _constructor_from_mgr(self, mgr, axes) -> DataFrame: + df = DataFrame._from_mgr(mgr, axes=axes) + + if type(self) is DataFrame: + # This would also work `if self._constructor is DataFrame`, but + # this check is slightly faster, benefiting the most-common case. + return df + + elif type(self).__name__ == "GeoDataFrame": + # Shim until geopandas can override their _constructor_from_mgr + # bc they have different behavior for Managers than for DataFrames return self._constructor(mgr) + # We assume that the subclass __init__ knows how to handle a + # pd.DataFrame object. 
+ return self._constructor(df) + _constructor_sliced: Callable[..., Series] = Series - def _sliced_from_mgr(self, mgr, axes) -> Series: - return Series._from_mgr(mgr, axes) + def _constructor_sliced_from_mgr(self, mgr, axes) -> Series: + ser = Series._from_mgr(mgr, axes) + ser._name = None # caller is responsible for setting real name - def _constructor_sliced_from_mgr(self, mgr, axes): - if self._constructor_sliced is Series: - ser = self._sliced_from_mgr(mgr, axes) - ser._name = None # caller is responsible for setting real name + if type(self) is DataFrame: + # This would also work `if self._constructor_sliced is Series`, but + # this check is slightly faster, benefiting the most-common case. return ser - assert axes is mgr.axes - return self._constructor_sliced(mgr) + + # We assume that the subclass __init__ knows how to handle a + # pd.Series object. + return self._constructor_sliced(ser) # ---------------------------------------------------------------------- # Constructors @@ -1403,7 +1414,8 @@ def _get_values_for_csv( na_rep=na_rep, quoting=quoting, ) - return self._constructor_from_mgr(mgr, axes=mgr.axes) + # error: Incompatible return value type (got "DataFrame", expected "Self") + return self._constructor_from_mgr(mgr, axes=mgr.axes) # type: ignore[return-value] # ---------------------------------------------------------------------- @@ -5077,7 +5089,8 @@ def predicate(arr: ArrayLike) -> bool: return True mgr = self._mgr._get_data_subset(predicate).copy(deep=None) - return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) + # error: Incompatible return value type (got "DataFrame", expected "Self") + return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) # type: ignore[return-value] def insert( self, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2a86f75badecd..796357355fef4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -336,6 +336,7 @@ def _as_manager(self, typ: str, copy: bool_t = True) -> Self: # fastpath of passing a manager doesn't check the option/manager class return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self) + @final @classmethod def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self: """ diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 2d430ef4dcff6..0dd808a0ab296 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2548,7 +2548,8 @@ def _take_new_index( if axis == 1: raise NotImplementedError("axis 1 is not supported") new_mgr = obj._mgr.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1) - return obj._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + # error: Incompatible return value type (got "DataFrame", expected "NDFrameT") + return obj._constructor_from_mgr(new_mgr, axes=new_mgr.axes) # type: ignore[return-value] else: raise ValueError("'obj' should be either a Series or a DataFrame") diff --git a/pandas/core/series.py b/pandas/core/series.py index c1782206d4b67..6fd019656d207 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -662,14 +662,17 @@ def _constructor(self) -> Callable[..., Series]: return Series def _constructor_from_mgr(self, mgr, axes): - if self._constructor is Series: - # we are pandas.Series (or a subclass that doesn't override _constructor) - ser = Series._from_mgr(mgr, axes=axes) - ser._name = None # caller is responsible for setting real name + ser = Series._from_mgr(mgr, axes=axes) + ser._name = None # caller is responsible for setting real name + + if type(self) is 
Series:
+            # This would also work `if self._constructor is Series`, but
+            # this check is slightly faster, benefiting the most-common case.
             return ser
-        else:
-            assert axes is mgr.axes
-            return self._constructor(mgr)
+
+        # We assume that the subclass __init__ knows how to handle a
+        # pd.Series object.
+        return self._constructor(ser)

     @property
     def _constructor_expanddim(self) -> Callable[..., DataFrame]:
@@ -681,18 +684,19 @@ def _constructor_expanddim(self) -> Callable[..., DataFrame]:

         return DataFrame

-    def _expanddim_from_mgr(self, mgr, axes) -> DataFrame:
+    def _constructor_expanddim_from_mgr(self, mgr, axes):
         from pandas.core.frame import DataFrame

-        return DataFrame._from_mgr(mgr, axes=mgr.axes)
+        df = DataFrame._from_mgr(mgr, axes=mgr.axes)

-    def _constructor_expanddim_from_mgr(self, mgr, axes):
-        from pandas.core.frame import DataFrame
+        if type(self) is Series:
+            # This would also work `if self._constructor_expanddim is DataFrame`,
+            # but this check is slightly faster, benefiting the most-common case.
+            return df

-        if self._constructor_expanddim is DataFrame:
-            return self._expanddim_from_mgr(mgr, axes)
-        assert axes is mgr.axes
-        return self._constructor_expanddim(mgr)
+
+        # We assume that the subclass __init__ knows how to handle a
+        # pd.DataFrame object.
+        return self._constructor_expanddim(df)

     # types
     @property
diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py
index ef78ae62cb4d6..855b58229cbdb 100644
--- a/pandas/tests/frame/test_subclass.py
+++ b/pandas/tests/frame/test_subclass.py
@@ -26,6 +26,17 @@ def _constructor(self):


 class TestDataFrameSubclassing:
+    def test_no_warning_on_mgr(self):
+        # GH#57032
+        df = tm.SubclassedDataFrame(
+            {"X": [1, 2, 3], "Y": [1, 2, 3]}, index=["a", "b", "c"]
+        )
+        with tm.assert_produces_warning(None):
+            # df.isna() goes through _constructor_from_mgr, which we want to
+            # *not* pass a Manager to __init__
+            df.isna()
+            df["X"].isna()
+
     def test_frame_subclassing_and_slicing(self):
         # Subclass frame and ensure it returns the right class on slicing it
         # In reference to PR 9632

From 822d285a5e901bff7aea6406ee46131753dbd6ee Mon Sep 17 00:00:00 2001
From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com>
Date: Mon, 1 Apr 2024 23:23:48 +0200
Subject: [PATCH 147/396] Backport PR #58075 on branch 2.2.x (DOC: whatsnew note for #57553) (#58080)

Backport PR #58075: DOC: whatsnew note for #57553

Co-authored-by: jbrockmendel
---
 doc/source/whatsnew/v2.2.2.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst
index 9e1a883d47cf8..0dac3660c76b2 100644
--- a/doc/source/whatsnew/v2.2.2.rst
+++ b/doc/source/whatsnew/v2.2.2.rst
@@ -15,6 +15,7 @@ Fixed regressions
 ~~~~~~~~~~~~~~~~~
 - :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the a column's type was a pandas nullable on with missing values (:issue:`56702`)
 - :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the a column's type was a pyarrow nullable on with missing values (:issue:`57664`)
+- Avoid issuing a spurious ``DeprecationWarning`` when a custom :class:`DataFrame` or :class:`Series` subclass method is called (:issue:`57553`)
 - Fixed regression in precision of :func:`to_datetime` with string and ``unit`` input (:issue:`57051`)

..
--------------------------------------------------------------------------- From e9b81ee3ecca2720f59e21091528e1e1e7eafe9a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 2 Apr 2024 22:57:16 -0400 Subject: [PATCH 148/396] Backport PR #58126: BLD: Build wheels with numpy 2.0rc1 (#58127) --- pyproject.toml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c225ed80dcb10..b2764b137a1f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,12 +6,9 @@ requires = [ "meson==1.2.1", "wheel", "Cython==3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json - # Any NumPy version should be fine for compiling. Users are unlikely - # to get a NumPy<1.25 so the result will be compatible with all relevant - # NumPy versions (if not it is presumably compatible with their version). - # Pin <2.0 for releases until tested against an RC. But explicitly allow - # testing the `.dev0` nightlies (which require the extra index). - "numpy>1.22.4,<=2.0.0.dev0", + # Force numpy higher than 2.0rc1, so that built wheels are compatible + # with both numpy 1 and 2 + "numpy>=2.0.0rc1", "versioneer[toml]" ] From 0f83d50c477f8aafb61a3db1d310d0b5fd261adc Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 3 Apr 2024 13:28:27 -0400 Subject: [PATCH 149/396] Revert "BLD: Pin numpy on 2.2.x" (#58093) Revert "BLD: Pin numpy on 2.2.x (#56812)" This reverts commit 24ea67fcf0cf982d011d249f2a711ef178e13065. --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b2764b137a1f8..778146bbcd909 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,9 +27,9 @@ authors = [ license = {file = 'LICENSE'} requires-python = '>=3.9' dependencies = [ - "numpy>=1.22.4,<2; python_version<'3.11'", - "numpy>=1.23.2,<2; python_version=='3.11'", - "numpy>=1.26.0,<2; python_version>='3.12'", + "numpy>=1.22.4; python_version<'3.11'", + "numpy>=1.23.2; python_version=='3.11'", + "numpy>=1.26.0; python_version>='3.12'", "python-dateutil>=2.8.2", "pytz>=2020.1", "tzdata>=2022.7" From b56842d93dc76dc2a83b7ab640af6a419697decb Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 3 Apr 2024 21:29:53 +0200 Subject: [PATCH 150/396] Backport PR #58100 on branch 2.2.x (MNT: fix compatibility with beautifulsoup4 4.13.0b2) (#58137) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #58100: MNT: fix compatibility with beautifulsoup4 4.13.0b2 Co-authored-by: Clément Robert --- pandas/io/html.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 26e71c9546ffd..4eeeb1b655f8a 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -591,14 +591,8 @@ class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): :class:`pandas.io.html._HtmlFrameParser`. 
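
    Tables are located with a plain ``document.find_all("table", attrs=attrs)``
    call (see ``_parse_tables`` just below).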
""" - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - from bs4 import SoupStrainer - - self._strainer = SoupStrainer("table") - def _parse_tables(self, document, match, attrs): - element_name = self._strainer.name + element_name = "table" tables = document.find_all(element_name, attrs=attrs) if not tables: raise ValueError("No tables found") From a947587c8ad419e9cf8f6cce1fd5ad80c32d759f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 4 Apr 2024 00:02:48 +0200 Subject: [PATCH 151/396] Backport PR #58138 on branch 2.2.x (BLD: Fix nightlies not building) (#58140) Backport PR #58138: BLD: Fix nightlies not building Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/wheels.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 470c044d2e99e..b9bfc766fb45c 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -139,8 +139,7 @@ jobs: shell: bash -el {0} run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - - name: Build normal wheels - if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }} + - name: Build wheels uses: pypa/cibuildwheel@v2.17.0 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} From 691fc88fbd7b570017bbc05fa77754b24c1fa997 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Mon, 8 Apr 2024 23:34:15 +0200 Subject: [PATCH 152/396] Backport PR #58181 on branch 2.2.x (CI: correct error msg in test_view_index) (#58187) Backport PR #58181: CI: correct error msg in `test_view_index` --- pandas/tests/indexes/test_base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 1fa48f98942c2..b7204d7af1cbb 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -358,7 +358,10 @@ def test_view_with_args_object_array_raises(self, index): with pytest.raises(NotImplementedError, match="i8"): index.view("i8") else: - msg = "Cannot change data-type for object array" + msg = ( + "Cannot change data-type for array of references|" + "Cannot change data-type for object array|" + ) with pytest.raises(TypeError, match=msg): index.view("i8") From c7ec5663b3a5f041d7e8c93fa3d75a1314477e27 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 10 Apr 2024 00:40:19 +0200 Subject: [PATCH 153/396] Backport PR #58087 on branch 2.2.x (BLD: Build wheels using numpy 2.0rc1) (#58105) Backport PR #58087: BLD: Build wheels using numpy 2.0rc1 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .circleci/config.yml | 4 ---- .github/workflows/wheels.yml | 12 ------------ pyproject.toml | 7 ++++++- 3 files changed, 6 insertions(+), 17 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ea93575ac9430..6f134c9a7a7bd 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -72,10 +72,6 @@ jobs: no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that command: | pip3 install cibuildwheel==2.15.0 - # When this is a nightly wheel build, allow picking up NumPy 2.0 dev wheels: - if [[ "$IS_SCHEDULE_DISPATCH" == "true" || 
"$IS_PUSH" != 'true' ]]; then - export CIBW_ENVIRONMENT="PIP_EXTRA_INDEX_URL=https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - fi cibuildwheel --prerelease-pythons --output-dir wheelhouse environment: diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index b9bfc766fb45c..4bd9068e91b67 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -147,18 +147,6 @@ jobs: CIBW_PRERELEASE_PYTHONS: True CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} - - name: Build nightly wheels (with NumPy pre-release) - if: ${{ (env.IS_SCHEDULE_DISPATCH == 'true' && env.IS_PUSH != 'true') }} - uses: pypa/cibuildwheel@v2.17.0 - with: - package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} - env: - # The nightly wheels should be build witht he NumPy 2.0 pre-releases - # which requires the additional URL. - CIBW_ENVIRONMENT: PIP_EXTRA_INDEX_URL=https://pypi.anaconda.org/scientific-python-nightly-wheels/simple - CIBW_PRERELEASE_PYTHONS: True - CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} - - name: Set up Python uses: mamba-org/setup-micromamba@v1 with: diff --git a/pyproject.toml b/pyproject.toml index 778146bbcd909..db9f055799ab0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -153,6 +153,9 @@ setup = ['--vsenv'] # For Windows skip = "cp36-* cp37-* cp38-* pp* *_i686 *_ppc64le *_s390x" build-verbosity = "3" environment = {LDFLAGS="-Wl,--strip-all"} +# TODO: remove this once numpy 2.0 proper releases +# and specify numpy 2.0 as a dependency in [build-system] requires in pyproject.toml +before-build = "pip install numpy==2.0.0rc1" test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0" test-command = """ PANDAS_CI='1' python -c 'import pandas as pd; \ @@ -161,7 +164,9 @@ test-command = """ """ [tool.cibuildwheel.windows] -before-build = "pip install delvewheel" +# TODO: remove this once numpy 2.0 proper releases +# and specify numpy 2.0 as a dependency in [build-system] requires in pyproject.toml +before-build = "pip install delvewheel numpy==2.0.0rc1" repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}" [[tool.cibuildwheel.overrides]] From 45b0b32af03e65f8d9547c127f8694fb7c3e994f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 10 Apr 2024 14:06:47 +0200 Subject: [PATCH 154/396] Backport PR #58203 on branch 2.2.x (DOC: Add release date/contributors for 2.2.2) (#58206) Backport PR #58203: DOC: Add release date/contributors for 2.2.2 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 2 +- doc/source/whatsnew/v2.2.2.rst | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 310dd921e44f6..4db0069ec4b95 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -87,4 +87,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v2.2.0..v2.2.1|HEAD +.. contributors:: v2.2.0..v2.2.1 diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 0dac3660c76b2..589a868c850d3 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -1,6 +1,6 @@ .. 
_whatsnew_222: -What's new in 2.2.2 (April XX, 2024) +What's new in 2.2.2 (April 10, 2024) --------------------------------------- These are the changes in pandas 2.2.2. See :ref:`release` for a full changelog @@ -40,3 +40,5 @@ Other Contributors ~~~~~~~~~~~~ + +.. contributors:: v2.2.1..v2.2.2|HEAD From 5466f15462784eaa26e7ac43fc382f3802ab3ed1 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 10 Apr 2024 09:01:08 -0400 Subject: [PATCH 155/396] =?UTF-8?q?Backport=20PR=20#58202:=20DOC/TST:=20Do?= =?UTF-8?q?cument=20numpy=202.0=20support=20and=20add=20tests=E2=80=A6=20(?= =?UTF-8?q?#58208)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #58202: DOC/TST: Document numpy 2.0 support and add tests for string array --- doc/source/whatsnew/v2.2.2.rst | 15 +++++++++++++++ pandas/tests/frame/test_constructors.py | 19 +++++++++++++++++++ pandas/tests/series/test_constructors.py | 19 +++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 589a868c850d3..72a2f84c4aaee 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -9,6 +9,21 @@ including other versions of pandas. {{ header }} .. --------------------------------------------------------------------------- + +.. _whatsnew_220.np2_compat: + +Pandas 2.2.2 is now compatible with numpy 2.0 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Pandas 2.2.2 is the first version of pandas that is generally compatible with the upcoming +numpy 2.0 release, and wheels for pandas 2.2.2 will work with both numpy 1.x and 2.x. + +One major caveat is that arrays created with numpy 2.0's new ``StringDtype`` will convert +to ``object`` dtyped arrays upon :class:`Series`/:class:`DataFrame` creation. +Full support for numpy 2.0's StringDtype is expected to land in pandas 3.0. + +As usual please report any bugs discovered to our `issue tracker `_ + .. _whatsnew_222.regressions: Fixed regressions diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index acd0675fd43ec..cae2f6e81d384 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -24,6 +24,7 @@ from pandas._config import using_pyarrow_string_dtype from pandas._libs import lib +from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -3118,6 +3119,24 @@ def test_columns_indexes_raise_on_sets(self): with pytest.raises(ValueError, match="columns cannot be a set"): DataFrame(data, columns={"a", "b", "c"}) + # TODO: make this not cast to object in pandas 3.0 + @pytest.mark.skipif( + not np_version_gt2, reason="StringDType only available in numpy 2 and above" + ) + @pytest.mark.parametrize( + "data", + [ + {"a": ["a", "b", "c"], "b": [1.0, 2.0, 3.0], "c": ["d", "e", "f"]}, + ], + ) + def test_np_string_array_object_cast(self, data): + from numpy.dtypes import StringDType + + data["a"] = np.array(data["a"], dtype=StringDType()) + res = DataFrame(data) + assert res["a"].dtype == np.object_ + assert (res["a"] == data["a"]).all() + def get1(obj): # TODO: make a helper in tm? 
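    # (helper: extract the first scalar, whether obj is a Series or a DataFrame)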
if isinstance(obj, Series): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 4d3839553a0af..387be8398e4b2 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2191,6 +2191,25 @@ def test_series_constructor_infer_multiindex(self, container, data): multi = Series(data, index=indexes) assert isinstance(multi.index, MultiIndex) + # TODO: make this not cast to object in pandas 3.0 + @pytest.mark.skipif( + not np_version_gt2, reason="StringDType only available in numpy 2 and above" + ) + @pytest.mark.parametrize( + "data", + [ + ["a", "b", "c"], + ["a", "b", np.nan], + ], + ) + def test_np_string_array_object_cast(self, data): + from numpy.dtypes import StringDType + + arr = np.array(data, dtype=StringDType()) + res = Series(arr) + assert res.dtype == np.object_ + assert (res == data).all() + class TestSeriesConstructorInternals: def test_constructor_no_pandas_array(self, using_array_manager): From 98aeac9b1b559178ef4f6a0a112a09b1741d11d1 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 10 Apr 2024 12:42:52 -0400 Subject: [PATCH 156/396] Backport PR #58209: CI: Pin blosc to fix pytables (#58211) --- ci/deps/actions-310.yaml | 2 ++ ci/deps/actions-311-downstream_compat.yaml | 2 ++ ci/deps/actions-311.yaml | 2 ++ ci/deps/actions-312.yaml | 2 ++ ci/deps/actions-39-minimum_versions.yaml | 2 ++ ci/deps/actions-39.yaml | 2 ++ ci/deps/circle-310-arm64.yaml | 2 ++ environment.yml | 2 ++ scripts/generate_pip_deps_from_conda.py | 2 +- scripts/validate_min_versions_in_sync.py | 5 ++++- 10 files changed, 21 insertions(+), 2 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index a3e44e6373145..ea2336ae78f81 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -24,6 +24,8 @@ dependencies: # optional dependencies - beautifulsoup4>=4.11.2 + # https://github.com/conda-forge/pytables-feedstock/issues/97 + - c-blosc2=2.13.2 - blosc>=1.21.3 - bottleneck>=1.3.6 - fastparquet>=2022.12.0 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index d6bf9ec7843de..8f84a53b58610 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -26,6 +26,8 @@ dependencies: # optional dependencies - beautifulsoup4>=4.11.2 + # https://github.com/conda-forge/pytables-feedstock/issues/97 + - c-blosc2=2.13.2 - blosc>=1.21.3 - bottleneck>=1.3.6 - fastparquet>=2022.12.0 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 95cd1a4d46ef4..51a246ce73a11 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -24,6 +24,8 @@ dependencies: # optional dependencies - beautifulsoup4>=4.11.2 + # https://github.com/conda-forge/pytables-feedstock/issues/97 + - c-blosc2=2.13.2 - blosc>=1.21.3 - bottleneck>=1.3.6 - fastparquet>=2022.12.0 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index a442ed6feeb5d..7d2b9c39d2fe3 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -24,6 +24,8 @@ dependencies: # optional dependencies - beautifulsoup4>=4.11.2 + # https://github.com/conda-forge/pytables-feedstock/issues/97 + - c-blosc2=2.13.2 - blosc>=1.21.3 - bottleneck>=1.3.6 - fastparquet>=2022.12.0 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index 
7067048c4434d..cedf4fb9dc867 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -27,6 +27,8 @@ dependencies: # optional dependencies - beautifulsoup4=4.11.2 + # https://github.com/conda-forge/pytables-feedstock/issues/97 + - c-blosc2=2.13.2 - blosc=1.21.3 - bottleneck=1.3.6 - fastparquet=2022.12.0 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index b162a78e7f115..85f2a74e849ee 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -24,6 +24,8 @@ dependencies: # optional dependencies - beautifulsoup4>=4.11.2 + # https://github.com/conda-forge/pytables-feedstock/issues/97 + - c-blosc2=2.13.2 - blosc>=1.21.3 - bottleneck>=1.3.6 - fastparquet>=2022.12.0 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index a19ffd485262d..c018ad94e7f30 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -25,6 +25,8 @@ dependencies: # optional dependencies - beautifulsoup4>=4.11.2 + # https://github.com/conda-forge/pytables-feedstock/issues/97 + - c-blosc2=2.13.2 - blosc>=1.21.3 - bottleneck>=1.3.6 - fastparquet>=2022.12.0 diff --git a/environment.yml b/environment.yml index 58eb69ad1f070..7f2db06d4d50e 100644 --- a/environment.yml +++ b/environment.yml @@ -27,6 +27,8 @@ dependencies: # optional dependencies - beautifulsoup4>=4.11.2 + # https://github.com/conda-forge/pytables-feedstock/issues/97 + - c-blosc2=2.13.2 - blosc - bottleneck>=1.3.6 - fastparquet>=2022.12.0 diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 5fcf09cd073fe..bf38d2fa419d1 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -23,7 +23,7 @@ import tomli as tomllib import yaml -EXCLUDE = {"python", "c-compiler", "cxx-compiler"} +EXCLUDE = {"python", "c-compiler", "cxx-compiler", "c-blosc2"} REMAP_VERSION = {"tzdata": "2022.7"} CONDA_TO_PIP = { "pytables": "tables", diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py index 7dd3e96e6ec18..62a92cdd10ebc 100755 --- a/scripts/validate_min_versions_in_sync.py +++ b/scripts/validate_min_versions_in_sync.py @@ -36,7 +36,7 @@ SETUP_PATH = pathlib.Path("pyproject.toml").resolve() YAML_PATH = pathlib.Path("ci/deps") ENV_PATH = pathlib.Path("environment.yml") -EXCLUDE_DEPS = {"tzdata", "blosc", "pandas-gbq", "pyqt", "pyqt5"} +EXCLUDE_DEPS = {"tzdata", "blosc", "c-blosc2", "pandas-gbq", "pyqt", "pyqt5"} EXCLUSION_LIST = frozenset(["python=3.8[build=*_pypy]"]) # pandas package is not available # in pre-commit environment @@ -225,6 +225,9 @@ def get_versions_from_ci(content: list[str]) -> tuple[dict[str, str], dict[str, seen_required = True elif "# optional dependencies" in line: seen_optional = True + elif "#" in line: + # just a comment + continue elif "- pip:" in line: continue elif seen_required and line.strip(): From d9cdd2ee5a58015ef6f4d15c7226110c9aab8140 Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Wed, 10 Apr 2024 13:42:52 -0400 Subject: [PATCH 157/396] RLS: 2.2.2 From c26952559669d69d2a32bc8bc27c6869a0323745 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 15 Apr 2024 23:01:12 +0200 Subject: [PATCH 158/396] Backport PR #58268 on branch 2.2.x (CI/TST: Unxfail test_slice_locs_negative_step Pyarrow test) (#58269) Backport PR #58268: CI/TST: Unxfail 
test_slice_locs_negative_step Pyarrow test Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/indexes/object/test_indexing.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 443cacf94d239..ebf9dac715f8d 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -7,7 +7,6 @@ NA, is_matching_na, ) -from pandas.compat import pa_version_under16p0 import pandas.util._test_decorators as td import pandas as pd @@ -201,16 +200,7 @@ class TestSliceLocs: (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected, dtype, request): - if ( - not pa_version_under16p0 - and dtype == "string[pyarrow_numpy]" - and in_slice == slice("a", "a", -1) - ): - request.applymarker( - pytest.mark.xfail(reason="https://github.com/apache/arrow/issues/40642") - ) - + def test_slice_locs_negative_step(self, in_slice, expected, dtype): index = Index(list("bcdxy"), dtype=dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) From 294b2156e56e73e1a2395db46d88578c1336349d Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 17 Apr 2024 22:00:57 +0200 Subject: [PATCH 159/396] Backport PR #58293 on branch 2.2.x (CI: Pin docutils to < 0.21) (#58295) Backport PR #58293: CI: Pin docutils to < 0.21 Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- environment.yml | 1 + requirements-dev.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index 7f2db06d4d50e..aef3ce66ff352 100644 --- a/environment.yml +++ b/environment.yml @@ -90,6 +90,7 @@ dependencies: - numpydoc - pydata-sphinx-theme=0.14 - pytest-cython # doctest + - docutils < 0.21 # https://github.com/sphinx-doc/sphinx/issues/12302 - sphinx - sphinx-design - sphinx-copybutton diff --git a/requirements-dev.txt b/requirements-dev.txt index 5a63e59e1db88..c19ae8ea93bb5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -63,6 +63,7 @@ natsort numpydoc pydata-sphinx-theme==0.14 pytest-cython +docutils < 0.21 sphinx sphinx-design sphinx-copybutton From 35c237731d671e7ca5b7b1b3984f38a65795a896 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 24 Apr 2024 19:59:42 +0200 Subject: [PATCH 160/396] Backport PR #58389 on branch 2.2.x (CI: Fix npdev failures) (#58408) Backport PR #58389: CI: Fix npdev failures Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/arrays/test_datetimelike.py | 8 ++++++-- pandas/tests/extension/base/missing.py | 2 ++ pandas/tests/indexes/test_base.py | 4 ++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 7f85c891afeed..4961123a7ca07 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -659,7 +659,9 @@ def test_array_interface(self, datetime_index): assert result is expected tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype="datetime64[ns]") - assert result is not expected + if not np_version_gt2: + # TODO: GH 57739 + assert result is not expected 
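+            # (NumPy >= 2 may hand back the identical underlying array here,
+            # so object identity is only asserted on 1.x; see GH 57739)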
tm.assert_numpy_array_equal(result, expected) # to object dtype @@ -974,7 +976,9 @@ def test_array_interface(self, timedelta_index): assert result is expected tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype="timedelta64[ns]") - assert result is not expected + if not np_version_gt2: + # TODO: GH 57739 + assert result is not expected tm.assert_numpy_array_equal(result, expected) # to object dtype diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index dbd6682c12123..fb15b2dec869c 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -27,7 +27,9 @@ def test_isna_returns_copy(self, data_missing, na_func): expected = result.copy() mask = getattr(result, na_func)() if isinstance(mask.dtype, pd.SparseDtype): + # TODO: GH 57739 mask = np.array(mask) + mask.flags.writeable = True mask[:] = True tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index b7204d7af1cbb..7eeb626d91dc8 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -71,8 +71,8 @@ def test_constructor_casting(self, index): tm.assert_contains_all(arr, new_index) tm.assert_index_equal(index, new_index) - @pytest.mark.parametrize("index", ["string"], indirect=True) - def test_constructor_copy(self, index, using_infer_string): + def test_constructor_copy(self, using_infer_string): + index = Index(list("abc"), name="name") arr = np.array(index) new_index = Index(arr, copy=True, name="name") assert isinstance(new_index, Index) From ecb90b55263dc3523c9d453e0315a566c7f639a6 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 6 May 2024 20:25:07 +0200 Subject: [PATCH 161/396] Backport PR #58590 on branch 2.2.x (BUG: Use large_string in string array consistently) (#58597) Backport PR #58590: BUG: Use large_string in string array consistently Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/arrays/string_arrow.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e8f614ff855c0..50527dace0b82 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -190,13 +190,13 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal na_values = scalars._mask result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - return cls(pa.array(result, mask=na_values, type=pa.string())) + return cls(pa.array(result, mask=na_values, type=pa.large_string())) elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): - return cls(pc.cast(scalars, pa.string())) + return cls(pc.cast(scalars, pa.large_string())) # convert non-na-likes to str result = lib.ensure_string_array(scalars, copy=copy) - return cls(pa.array(result, type=pa.string(), from_pandas=True)) + return cls(pa.array(result, type=pa.large_string(), from_pandas=True)) @classmethod def _from_sequence_of_strings( @@ -239,7 +239,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: value_set = [ pa_scalar.as_py() for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values] - if pa_scalar.type in (pa.string(), pa.null()) + if pa_scalar.type in (pa.string(), pa.null(), pa.large_string()) ] # short-circuit to return all False array. 
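        # (an empty value_set means no candidate can match, so the result
        # is immediately all-False)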
@@ -337,7 +337,9 @@ def _str_map( result = lib.map_infer_mask( arr, f, mask.view("uint8"), convert=False, na_value=na_value ) - result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) + result = pa.array( + result, mask=mask, type=pa.large_string(), from_pandas=True + ) return type(self)(result) else: # This is when the result type is object. We reach this when @@ -658,7 +660,9 @@ def _str_map( result = lib.map_infer_mask( arr, f, mask.view("uint8"), convert=False, na_value=na_value ) - result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) + result = pa.array( + result, mask=mask, type=pa.large_string(), from_pandas=True + ) return type(self)(result) else: # This is when the result type is object. We reach this when From d5e362e9782f5a9b8b4d82970f5af69b9051abd7 Mon Sep 17 00:00:00 2001 From: Mohammad Ahmadi Date: Thu, 9 May 2024 18:38:40 +0330 Subject: [PATCH 162/396] DOC: Fix typo in indexing.rst (#58653) Fix typo in "Returning a view versus a copy" section --- doc/source/user_guide/indexing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 4954ee1538697..ba5a5c7db614b 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1730,7 +1730,7 @@ Returning a view versus a copy .. warning:: :ref:`Copy-on-Write ` - will become the new default in pandas 3.0. This means than chained indexing will + will become the new default in pandas 3.0. This means that chained indexing will never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary anymore. See :ref:`this section ` From 967e1c7dbdf4b602dd69ee9cdae4b2908dc3d91c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 9 May 2024 22:14:26 +0200 Subject: [PATCH 163/396] Backport PR #58658 on branch 2.2.x (CI/TST: Don't xfail test_api_read_sql_duplicate_columns for pyarrow=16 and sqlite) (#58660) Backport PR #58658: CI/TST: Don't xfail test_api_read_sql_duplicate_columns for pyarrow=16 and sqlite Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/io/test_sql.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 4f1f965f26aa9..ab851b02c876a 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2320,9 +2320,15 @@ def test_api_escaped_table_name(conn, request): def test_api_read_sql_duplicate_columns(conn, request): # GH#53117 if "adbc" in conn: - request.node.add_marker( - pytest.mark.xfail(reason="pyarrow->pandas throws ValueError", strict=True) - ) + pa = pytest.importorskip("pyarrow") + if not ( + Version(pa.__version__) >= Version("16.0") and conn == "sqlite_adbc_conn" + ): + request.node.add_marker( + pytest.mark.xfail( + reason="pyarrow->pandas throws ValueError", strict=True + ) + ) conn = request.getfixturevalue(conn) if sql.has_table("test_table", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: From a8b5c5d223e76e13157ac3700f82a08b3eb84271 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 13 May 2024 23:40:05 +0200 Subject: [PATCH 164/396] Backport PR #58693 on branch 2.2.x (TST: Fix CI failures (don't xfail postgresql / don't xfail for pyarrow=16)) (#58709) Backport PR #58693: TST: Fix CI failures (don't xfail postgresql / don't xfail for 
pyarrow=16) Co-authored-by: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> --- pandas/tests/io/parser/test_multi_thread.py | 11 +++++++++-- pandas/tests/io/test_sql.py | 3 ++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index da9b9bddd30cd..704ca010f6506 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -12,6 +12,7 @@ import pandas as pd from pandas import DataFrame import pandas._testing as tm +from pandas.util.version import Version xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @@ -23,10 +24,16 @@ ] -@xfail_pyarrow # ValueError: Found non-unique column index -def test_multi_thread_string_io_read_csv(all_parsers): +@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") +def test_multi_thread_string_io_read_csv(all_parsers, request): # see gh-11786 parser = all_parsers + if parser.engine == "pyarrow": + pa = pytest.importorskip("pyarrow") + if Version(pa.__version__) < Version("16.0"): + request.applymarker( + pytest.mark.xfail(reason="# ValueError: Found non-unique column index") + ) max_row_range = 100 num_files = 10 diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index ab851b02c876a..7068247bbfa8b 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2322,7 +2322,8 @@ def test_api_read_sql_duplicate_columns(conn, request): if "adbc" in conn: pa = pytest.importorskip("pyarrow") if not ( - Version(pa.__version__) >= Version("16.0") and conn == "sqlite_adbc_conn" + Version(pa.__version__) >= Version("16.0") + and conn in ["sqlite_adbc_conn", "postgresql_adbc_conn"] ): request.node.add_marker( pytest.mark.xfail( From 6fd9558004402308849f0652069312baa74cb0d5 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 14 May 2024 21:58:31 +0200 Subject: [PATCH 165/396] Backport PR #58719 on branch 2.2.x (CI: xfail test_to_xarray_index_types due to new 2024.5 release) (#58720) Backport PR #58719: CI: xfail test_to_xarray_index_types due to new 2024.5 release Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/generic/test_to_xarray.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py index d8401a8b2ae3f..491f621783a76 100644 --- a/pandas/tests/generic/test_to_xarray.py +++ b/pandas/tests/generic/test_to_xarray.py @@ -9,6 +9,7 @@ date_range, ) import pandas._testing as tm +from pandas.util.version import Version pytest.importorskip("xarray") @@ -29,11 +30,17 @@ def df(self): } ) - def test_to_xarray_index_types(self, index_flat, df, using_infer_string): + def test_to_xarray_index_types(self, index_flat, df, using_infer_string, request): index = index_flat # MultiIndex is tested in test_to_xarray_with_multiindex if len(index) == 0: pytest.skip("Test doesn't make sense for empty index") + import xarray + + if Version(xarray.__version__) >= Version("2024.5"): + request.applymarker( + pytest.mark.xfail(reason="https://github.com/pydata/xarray/issues/9026") + ) from xarray import Dataset From 0eb84b35cc0d176f4e859bfc985c09433a597bc8 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 13 Jun 2024 16:12:30 -0700 Subject: [PATCH 166/396] Backport PR 
#58992 on branch 2.2.x (PERF: cache plotting date locators for DatetimeIndex plotting) (#59002) Backport PR #58992: PERF: cache plotting date locators for DatetimeIndex plotting Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/plotting/_matplotlib/converter.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 0eb3318ac96c5..9acb93ce69a9c 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -584,7 +584,8 @@ def _get_periods_per_ymd(freq: BaseOffset) -> tuple[int, int, int]: return ppd, ppm, ppy -def _daily_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _daily_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: # error: "BaseOffset" has no attribute "_period_dtype_code" dtype_code = freq._period_dtype_code # type: ignore[attr-defined] @@ -783,7 +784,8 @@ def _second_finder(label_interval: int) -> None: return info -def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _monthly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin @@ -854,7 +856,8 @@ def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: return info -def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _quarterly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin (vmin, vmax) = (int(vmin), int(vmax)) @@ -901,7 +904,8 @@ def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: return info -def _annual_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _annual_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: # Note: small difference here vs other finders in adding 1 to vmax (vmin, vmax) = (int(vmin), int(vmax + 1)) span = vmax - vmin + 1 From 888295988e8cb8545a7f8b649e275255ffad7b9c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 14 Jun 2024 13:33:23 -0700 Subject: [PATCH 167/396] Backport PR #59013 on branch 2.2.x (CI: remove xfail in test_to_xarray_index_types ) (#59015) Backport PR #59013: CI: remove xfail in test_to_xarray_index_types Co-authored-by: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> --- pandas/tests/generic/test_to_xarray.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py index 491f621783a76..d8401a8b2ae3f 100644 --- a/pandas/tests/generic/test_to_xarray.py +++ b/pandas/tests/generic/test_to_xarray.py @@ -9,7 +9,6 @@ date_range, ) import pandas._testing as tm -from pandas.util.version import Version pytest.importorskip("xarray") @@ -30,17 +29,11 @@ def df(self): } ) - def test_to_xarray_index_types(self, index_flat, df, using_infer_string, request): + def test_to_xarray_index_types(self, index_flat, df, using_infer_string): index = index_flat # MultiIndex is tested in test_to_xarray_with_multiindex if len(index) == 0: pytest.skip("Test doesn't make sense for empty index") - import xarray - - if Version(xarray.__version__) >= Version("2024.5"): - request.applymarker( - pytest.mark.xfail(reason="https://github.com/pydata/xarray/issues/9026") - ) from xarray import 
Dataset From 2a1417ad8fcd2850d69cf5ed68a73da3eadc6050 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 24 Jun 2024 13:43:20 -1000 Subject: [PATCH 168/396] Backport PR #59046: TST: Fix some test builds for numpy 2.0 (#59086) --- ci/deps/actions-311-pyarrownightly.yaml | 2 +- pandas/compat/__init__.py | 2 ++ pandas/compat/numpy/__init__.py | 2 +- pandas/compat/pyarrow.py | 2 ++ pandas/core/dtypes/cast.py | 13 +++++-------- pandas/tests/indexes/datetimelike_/test_indexing.py | 2 +- pandas/tests/io/test_parquet.py | 4 ++++ pandas/tests/scalar/timedelta/test_arithmetic.py | 2 +- pandas/tests/tools/test_to_datetime.py | 13 ++++++++++++- 9 files changed, 29 insertions(+), 13 deletions(-) diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index d84063ac2a9ba..5455b9b84b034 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -18,7 +18,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz - pip diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index eb890c8b8c0ab..5ada6d705172f 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -31,6 +31,7 @@ pa_version_under14p0, pa_version_under14p1, pa_version_under16p0, + pa_version_under17p0, ) if TYPE_CHECKING: @@ -188,6 +189,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under14p0", "pa_version_under14p1", "pa_version_under16p0", + "pa_version_under17p0", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 3014bd652d8c4..a06761d03887b 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -12,7 +12,7 @@ np_version_gte1p24 = _nlv >= Version("1.24") np_version_gte1p24p3 = _nlv >= Version("1.24.3") np_version_gte1p25 = _nlv >= Version("1.25") -np_version_gt2 = _nlv >= Version("2.0.0.dev0") +np_version_gt2 = _nlv >= Version("2.0.0") is_numpy_dev = _nlv.dev is not None _min_numpy_ver = "1.22.4" diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index a2dfa69bbf236..457d26766520d 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -16,6 +16,7 @@ pa_version_under14p1 = _palv < Version("14.0.1") pa_version_under15p0 = _palv < Version("15.0.0") pa_version_under16p0 = _palv < Version("16.0.0") + pa_version_under17p0 = _palv < Version("17.0.0") except ImportError: pa_version_under10p1 = True pa_version_under11p0 = True @@ -25,3 +26,4 @@ pa_version_under14p1 = True pa_version_under15p0 = True pa_version_under16p0 = True + pa_version_under17p0 = True diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7dd81ec59bc49..b72293b52df06 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -39,7 +39,6 @@ is_supported_dtype, ) from pandas._libs.tslibs.timedeltas import array_to_timedelta64 -from pandas.compat.numpy import np_version_gt2 from pandas.errors import ( IntCastingNaNError, LossySetitemError, @@ -1647,13 +1646,11 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n with warnings.catch_warnings(): # We already disallow dtype=uint w/ negative numbers # (test_constructor_coercion_signed_to_unsigned) so safe to ignore. 
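            # e.g. np.asarray([1000], dtype=np.int8) emits this warning on
            # NumPy 1.24-1.26 (value chosen purely for illustration)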
- if not np_version_gt2: - warnings.filterwarnings( - "ignore", - "NumPy will stop allowing conversion of " - "out-of-bound Python int", - DeprecationWarning, - ) + warnings.filterwarnings( + "ignore", + "NumPy will stop allowing conversion of out-of-bound Python int", + DeprecationWarning, + ) casted = np.asarray(arr, dtype=dtype) else: with warnings.catch_warnings(): diff --git a/pandas/tests/indexes/datetimelike_/test_indexing.py b/pandas/tests/indexes/datetimelike_/test_indexing.py index ee7128601256a..7b2c81aaf17de 100644 --- a/pandas/tests/indexes/datetimelike_/test_indexing.py +++ b/pandas/tests/indexes/datetimelike_/test_indexing.py @@ -19,7 +19,7 @@ @pytest.mark.parametrize("ldtype", dtlike_dtypes) @pytest.mark.parametrize("rdtype", dtlike_dtypes) def test_get_indexer_non_unique_wrong_dtype(ldtype, rdtype): - vals = np.tile(3600 * 10**9 * np.arange(3), 2) + vals = np.tile(3600 * 10**9 * np.arange(3, dtype=np.int64), 2) def construct(dtype): if dtype is dtlike_dtypes[-1]: diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index e4b94177eedb2..2874279add3e6 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -16,6 +16,7 @@ pa_version_under11p0, pa_version_under13p0, pa_version_under15p0, + pa_version_under17p0, ) import pandas as pd @@ -1063,6 +1064,9 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) + @pytest.mark.xfail( + pa_version_under17p0, reason="pa.pandas_compat passes 'datetime64' to .astype" + ) def test_columns_dtypes_not_invalid(self, pa): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index d2fa0f722ca6f..4fc59880c49dd 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -418,7 +418,7 @@ def test_td_mul_numeric_ndarray(self): def test_td_mul_numeric_ndarray_0d(self): td = Timedelta("1 day") - other = np.array(2) + other = np.array(2, dtype=np.int64) assert other.ndim == 0 expected = Timedelta("2 days") diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index a1ed996dade8e..ede38ce9c9a09 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3407,7 +3407,18 @@ def test_invalid_origin(self, unit): with pytest.raises(ValueError, match=msg): to_datetime("2005-01-01", origin="1960-01-01", unit=unit) - def test_epoch(self, units, epochs, epoch_1960, units_from_epochs): + @pytest.mark.parametrize( + "epochs", + [ + Timestamp(1960, 1, 1), + datetime(1960, 1, 1), + "1960-01-01", + np.datetime64("1960-01-01"), + ], + ) + def test_epoch(self, units, epochs): + epoch_1960 = Timestamp(1960, 1, 1) + units_from_epochs = np.arange(5, dtype=np.int64) expected = Series( [pd.Timedelta(x, unit=units) + epoch_1960 for x in units_from_epochs] ) From 6f97b57dcca00ac8eab52029669da7a3245c6095 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 25 Jun 2024 10:31:18 -0700 Subject: [PATCH 169/396] Backport PR #59094 on branch 2.2.x (BUG: Fix sparse doctests for SciPy 1.14.0) (#59104) Backport PR #59094: BUG: Fix sparse doctests for SciPy 1.14.0 Co-authored-by: Lysandros Nikolaou --- pandas/core/arrays/sparse/accessor.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/sparse/accessor.py 
b/pandas/core/arrays/sparse/accessor.py
index fc7debb1f31e4..67bb417865475 100644
--- a/pandas/core/arrays/sparse/accessor.py
+++ b/pandas/core/arrays/sparse/accessor.py
@@ -92,8 +92,8 @@ def from_coo(cls, A, dense_index: bool = False) -> Series:
        ...     ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
        ... )
        >>> A
-       <3x4 sparse matrix of type '<class 'numpy.float64'>'
-               with 3 stored elements in COOrdinate format>
+       <COOrdinate sparse matrix of dtype 'float64'
+               with 3 stored elements and shape (3, 4)>
        >>> A.todense()
        matrix([[0., 0., 1., 2.],
@@ -178,8 +178,8 @@ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False)
        ...     row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
        ... )
        >>> A
-       <3x4 sparse matrix of type '<class 'numpy.float64'>'
-               with 3 stored elements in COOrdinate format>
+       <COOrdinate sparse matrix of dtype 'float64'
+               with 3 stored elements and shape (3, 4)>
        >>> A.todense()
        matrix([[0., 0., 1., 3.],
                [3., 0., 0., 0.],
@@ -350,8 +350,8 @@ def to_coo(self):
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
        >>> df.sparse.to_coo()
-       <4x1 sparse matrix of type '<class 'numpy.float64'>'
-               with 2 stored elements in COOrdinate format>
+       <COOrdinate sparse matrix of dtype 'float64'
+               with 2 stored elements and shape (4, 1)>
        """
        import_optional_dependency("scipy")
        from scipy.sparse import coo_matrix

From 243457dc1d1533d1ed18cdb36db810a8f1faa06d Mon Sep 17 00:00:00 2001
From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com>
Date: Mon, 1 Jul 2024 19:38:28 +0200
Subject: [PATCH 170/396] Backport PR #59114 on branch 2.2.x (BUG: Allow show_versions to work for any module that raises an exception) (#59158)

Backport PR #59114: BUG: Allow show_versions to work for any module that raises an exception

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 pandas/util/_print_versions.py | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py
index e39c2f7badb1d..4ede5627c28b9 100644
--- a/pandas/util/_print_versions.py
+++ b/pandas/util/_print_versions.py
@@ -45,7 +45,7 @@ def _get_sys_info() -> dict[str, JSONSerializable]:
     language_code, encoding = locale.getlocale()
     return {
         "commit": _get_commit_hash(),
-        "python": ".".join([str(i) for i in sys.version_info]),
+        "python": platform.python_version(),
         "python-bits": struct.calcsize("P") * 8,
         "OS": uname_result.system,
         "OS-release": uname_result.release,
@@ -70,33 +70,25 @@ def _get_dependency_info() -> dict[str, JSONSerializable]:
         "pytz",
         "dateutil",
         # install / build,
-        "setuptools",
         "pip",
         "Cython",
-        # test
-        "pytest",
-        "hypothesis",
         # docs
         "sphinx",
-        # Other, need a min version
-        "blosc",
-        "feather",
-        "xlsxwriter",
-        "lxml.etree",
-        "html5lib",
-        "pymysql",
-        "psycopg2",
-        "jinja2",
         # Other, not imported.
"IPython", - "pandas_datareader", ] + # Optional dependencies deps.extend(list(VERSIONS)) result: dict[str, JSONSerializable] = {} for modname in deps: - mod = import_optional_dependency(modname, errors="ignore") - result[modname] = get_version(mod) if mod else None + try: + mod = import_optional_dependency(modname, errors="ignore") + except Exception: + # Dependency conflicts may cause a non ImportError + result[modname] = "N/A" + else: + result[modname] = get_version(mod) if mod else None return result From 98ba07a347836ef92a4affc4d366e3fc3c6b0d0c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 3 Jul 2024 22:04:43 +0200 Subject: [PATCH 171/396] Backport PR #59168 on branch 2.2.x (TST: Address UserWarning in matplotlib test) (#59175) Backport PR #59168: TST: Address UserWarning in matplotlib test Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/plotting/_matplotlib/core.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 2979903edf360..52382d9f7d572 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -893,7 +893,13 @@ def _make_legend(self) -> None: elif self.subplots and self.legend: for ax in self.axes: if ax.get_visible(): - ax.legend(loc="best") + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "No artists with labels found to put in legend.", + UserWarning, + ) + ax.legend(loc="best") @final @staticmethod From f656d52a8cc192f6fcd15d0f540ccb4ba4cd6eb0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 25 Jul 2024 02:31:35 +0200 Subject: [PATCH 172/396] Backport PR #59306 on branch 2.2.x (CI: xfail test_to_read_gcs for pyarrow=17) (#59308) Backport PR #59306: CI: xfail test_to_read_gcs for pyarrow=17 Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/io/test_gcs.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 0ce6a8bf82cd8..4b337b5b82052 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under17p0 + from pandas import ( DataFrame, Index, @@ -52,7 +54,7 @@ def ls(self, path, **kwargs): # Patches pyarrow; other processes should not pick up change @pytest.mark.single_cpu @pytest.mark.parametrize("format", ["csv", "json", "parquet", "excel", "markdown"]) -def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys): +def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys, request): """ Test that many to/read functions support GCS. 
@@ -91,6 +93,13 @@ def from_uri(path): to_local = pathlib.Path(path.replace("gs://", "")).absolute().as_uri() return pa_fs.LocalFileSystem(to_local) + request.applymarker( + pytest.mark.xfail( + not pa_version_under17p0, + raises=TypeError, + reason="pyarrow 17 broke the mocked filesystem", + ) + ) with monkeypatch.context() as m: m.setattr(pa_fs, "FileSystem", MockFileSystem) df1.to_parquet(path) From 785880cbe3208b180da31d427b2f006932c0c323 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 5 Aug 2024 10:27:30 -1000 Subject: [PATCH 173/396] Backport PR #59423: CI: Install libegl explicitly for pytest-qt on ubuntu (#59424) --- .circleci/config.yml | 1 + .github/workflows/code-checks.yml | 5 +++++ .github/workflows/docbuild-and-upload.yml | 4 ++++ .github/workflows/unit-tests.yml | 4 ++-- 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 6f134c9a7a7bd..0748d6550fe2d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -17,6 +17,7 @@ jobs: - run: > PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD + sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 ci/run_tests.sh linux-musl: docker: diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 8e29d56f47dcf..f908d1e572ab1 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -51,6 +51,11 @@ jobs: # TODO: The doctests have to be run first right now, since the Cython doctests only work # with pandas installed in non-editable mode # This can be removed once pytest-cython doesn't require C extensions to be installed inplace + + - name: Extra installs + # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd + run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 + - name: Run doctests run: cd ci && ./code_checks.sh doctests if: ${{ steps.build.outcome == 'success' && always() }} diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index 73acd9acc129a..e470b181772ed 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -46,6 +46,10 @@ jobs: - name: Build Pandas uses: ./.github/actions/build_pandas + - name: Extra installs + # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd + run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 + - name: Test website run: python -m pytest web/ diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index bacc3d874a60d..c1965fcbd9236 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -159,8 +159,8 @@ jobs: fetch-depth: 0 - name: Extra installs - run: sudo apt-get update && sudo apt-get install -y ${{ matrix.extra_apt }} - if: ${{ matrix.extra_apt }} + # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd + run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 ${{ matrix.extra_apt || ''}} - name: Generate extra locales # These extra locales will be available for locale.setlocale() calls in tests From 
795cce2a12b6ff77b998d16fcd3ffd22add0711f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 8 Aug 2024 01:16:06 +0200 Subject: [PATCH 174/396] Backport PR #59441 on branch 2.2.x (COMPAT: Fix numpy 2.1 timedelta * DateOffset) (#59444) Backport PR #59441: COMPAT: Fix numpy 2.1 timedelta * DateOffset Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/arrays/timedeltas.py | 8 ++++++++ pandas/tests/arithmetic/test_timedelta64.py | 8 +++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index e9260a3ec50a2..d4caec4bfd58a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -468,6 +468,10 @@ def __mul__(self, other) -> Self: if is_scalar(other): # numpy will accept float and int, raise TypeError for others result = self._ndarray * other + if result.dtype.kind != "m": + # numpy >= 2.1 may not raise a TypeError + # and seems to dispatch to others.__rmul__? + raise TypeError(f"Cannot multiply with {type(other).__name__}") freq = None if self.freq is not None and not isna(other): freq = self.freq * other @@ -495,6 +499,10 @@ def __mul__(self, other) -> Self: # numpy will accept float or int dtype, raise TypeError for others result = self._ndarray * other + if result.dtype.kind != "m": + # numpy >= 2.1 may not raise a TypeError + # and seems to dispatch to others.__rmul__? + raise TypeError(f"Cannot multiply with {type(other).__name__}") return type(self)._simple_new(result, dtype=result.dtype) __rmul__ = __mul__ diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 007d1e670e1e0..d02e827d435cf 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1454,7 +1454,13 @@ def test_td64arr_mul_int(self, box_with_array): def test_td64arr_mul_tdlike_scalar_raises(self, two_hours, box_with_array): rng = timedelta_range("1 days", "10 days", name="foo") rng = tm.box_expected(rng, box_with_array) - msg = "argument must be an integer|cannot use operands with types dtype" + msg = "|".join( + [ + "argument must be an integer", + "cannot use operands with types dtype", + "Cannot multiply with", + ] + ) with pytest.raises(TypeError, match=msg): rng * two_hours From 71ad17317aa68670e9425115f92f18d99c58ee0a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 19 Aug 2024 10:23:46 -0700 Subject: [PATCH 175/396] Backport PR #59545 on branch 2.2.x (CI: Fix ci for numpy 2 failures) (#59550) Backport PR #59545: CI: Fix ci for numpy 2 failures Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/plotting/_matplotlib/core.py | 2 +- pandas/tests/io/test_parquet.py | 4 ++++ pandas/tests/plotting/frame/test_frame.py | 11 +++++++++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 52382d9f7d572..3a1e589c2279b 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -547,7 +547,7 @@ def _maybe_right_yaxis(self, ax: Axes, axes_num: int) -> Axes: new_ax.set_yscale("log") elif self.logy == "sym" or self.loglog == "sym": new_ax.set_yscale("symlog") - return new_ax # type: ignore[return-value] + return new_ax @final @cache_readonly diff --git a/pandas/tests/io/test_parquet.py 
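The guard above rejects results that silently lost the timedelta dtype instead of relying on numpy to raise. The same check in isolation, assuming numpy is installed (the function name is illustrative):

    import numpy as np

    def timedelta_mul(values: np.ndarray, other) -> np.ndarray:
        # numpy >= 2.1 may dispatch to other.__rmul__ instead of raising a
        # TypeError, so verify the result kept the timedelta64 kind ("m").
        result = values * other
        if result.dtype.kind != "m":
            raise TypeError(f"Cannot multiply with {type(other).__name__}")
        return result

    td = np.array([1, 2, 3], dtype="timedelta64[ns]")
    print(timedelta_mul(td, 2))  # still timedelta64[ns]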
b/pandas/tests/io/test_parquet.py index 2874279add3e6..8771793672263 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1195,6 +1195,10 @@ def test_duplicate_columns(self, fp): msg = "Cannot create parquet dataset with duplicate column names" self.check_error_on_write(df, fp, ValueError, msg) + @pytest.mark.xfail( + Version(np.__version__) >= Version("2.0.0"), + reason="fastparquet uses np.float_ in numpy2", + ) def test_bool_with_none(self, fp): df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 45dc612148f40..4ca4067214bbd 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -44,6 +44,7 @@ _check_visible, get_y_axis, ) +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing @@ -2487,8 +2488,14 @@ def test_group_subplot_invalid_column_name(self): d = {"a": np.arange(10), "b": np.arange(10)} df = DataFrame(d) - with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"): - df.plot(subplots=[("a", "bad_name")]) + if Version(np.__version__) < Version("2.0.0"): + with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"): + df.plot(subplots=[("a", "bad_name")]) + else: + with pytest.raises( + ValueError, match=r"Column label\(s\) \[np\.str\_\('bad_name'\)\]" + ): + df.plot(subplots=[("a", "bad_name")]) def test_group_subplot_duplicated_column(self): d = {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} From dc47602021ef842bb2933eb0d5cae402b9a18e73 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 20 Aug 2024 14:13:45 -1000 Subject: [PATCH 176/396] Backport PR #59553: CI: Uninstall nomkl & 32 bit Interval tests (#59570) * Backport PR #59553: CI: Uninstall nomkl & 32 bit Interval tests * Update pandas/tests/indexes/interval/test_interval_tree.py * Update pandas/tests/indexes/interval/test_interval_tree.py * Update pandas/tests/indexing/interval/test_interval_new.py * Update pandas/tests/indexing/interval/test_interval_new.py --- .github/actions/build_pandas/action.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml index 63f687324b0ae..85b44ab24b36d 100644 --- a/.github/actions/build_pandas/action.yml +++ b/.github/actions/build_pandas/action.yml @@ -28,6 +28,13 @@ runs: fi shell: bash -el {0} + - name: Uninstall nomkl + run: | + if conda list nomkl | grep nomkl 1>/dev/null; then + conda remove nomkl -y + fi + shell: bash -el {0} + - name: Build Pandas run: | export CFLAGS="$CFLAGS ${{ inputs.cflags_adds }}" From 74312f3d32101df8753e8d894ade89f76dfc8131 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 12 Sep 2024 11:37:41 -0400 Subject: [PATCH 177/396] Backport PR #58218: Revert "CI: Pin blosc to fix pytables" --- ci/deps/actions-310.yaml | 2 -- ci/deps/actions-311-downstream_compat.yaml | 2 -- ci/deps/actions-311.yaml | 2 -- ci/deps/actions-312.yaml | 2 -- ci/deps/actions-39-minimum_versions.yaml | 2 -- ci/deps/actions-39.yaml | 2 -- ci/deps/circle-310-arm64.yaml | 2 -- environment.yml | 2 -- scripts/generate_pip_deps_from_conda.py | 2 +- scripts/validate_min_versions_in_sync.py | 5 +---- 10 files changed, 2 insertions(+), 21 deletions(-) diff --git 
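Both hunks above branch a test expectation on the installed numpy version. The same gate in isolation, using pandas' vendored ``Version`` (assumed importable):

    import numpy as np

    from pandas.util.version import Version

    # numpy 2 reprs column labels as np.str_('bad_name'), so the expected
    # error message differs by version, exactly as in the hunk above.
    if Version(np.__version__) >= Version("2.0.0"):
        expected = r"Column label\(s\) \[np\.str\_\('bad_name'\)\]"
    else:
        expected = r"Column label\(s\) \['bad_name'\]"
    print(expected)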
a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index ea2336ae78f81..a3e44e6373145 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -24,8 +24,6 @@ dependencies: # optional dependencies - beautifulsoup4>=4.11.2 - # https://github.com/conda-forge/pytables-feedstock/issues/97 - - c-blosc2=2.13.2 - blosc>=1.21.3 - bottleneck>=1.3.6 - fastparquet>=2022.12.0 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 8f84a53b58610..d6bf9ec7843de 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -26,8 +26,6 @@ dependencies: # optional dependencies - beautifulsoup4>=4.11.2 - # https://github.com/conda-forge/pytables-feedstock/issues/97 - - c-blosc2=2.13.2 - blosc>=1.21.3 - bottleneck>=1.3.6 - fastparquet>=2022.12.0 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 51a246ce73a11..95cd1a4d46ef4 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -24,8 +24,6 @@ dependencies: # optional dependencies - beautifulsoup4>=4.11.2 - # https://github.com/conda-forge/pytables-feedstock/issues/97 - - c-blosc2=2.13.2 - blosc>=1.21.3 - bottleneck>=1.3.6 - fastparquet>=2022.12.0 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 7d2b9c39d2fe3..a442ed6feeb5d 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -24,8 +24,6 @@ dependencies: # optional dependencies - beautifulsoup4>=4.11.2 - # https://github.com/conda-forge/pytables-feedstock/issues/97 - - c-blosc2=2.13.2 - blosc>=1.21.3 - bottleneck>=1.3.6 - fastparquet>=2022.12.0 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index cedf4fb9dc867..7067048c4434d 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -27,8 +27,6 @@ dependencies: # optional dependencies - beautifulsoup4=4.11.2 - # https://github.com/conda-forge/pytables-feedstock/issues/97 - - c-blosc2=2.13.2 - blosc=1.21.3 - bottleneck=1.3.6 - fastparquet=2022.12.0 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 85f2a74e849ee..b162a78e7f115 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -24,8 +24,6 @@ dependencies: # optional dependencies - beautifulsoup4>=4.11.2 - # https://github.com/conda-forge/pytables-feedstock/issues/97 - - c-blosc2=2.13.2 - blosc>=1.21.3 - bottleneck>=1.3.6 - fastparquet>=2022.12.0 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index c018ad94e7f30..a19ffd485262d 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -25,8 +25,6 @@ dependencies: # optional dependencies - beautifulsoup4>=4.11.2 - # https://github.com/conda-forge/pytables-feedstock/issues/97 - - c-blosc2=2.13.2 - blosc>=1.21.3 - bottleneck>=1.3.6 - fastparquet>=2022.12.0 diff --git a/environment.yml b/environment.yml index aef3ce66ff352..30c078051d330 100644 --- a/environment.yml +++ b/environment.yml @@ -27,8 +27,6 @@ dependencies: # optional dependencies - beautifulsoup4>=4.11.2 - # https://github.com/conda-forge/pytables-feedstock/issues/97 - - c-blosc2=2.13.2 - blosc - bottleneck>=1.3.6 - fastparquet>=2022.12.0 diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 
bf38d2fa419d1..5fcf09cd073fe 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -23,7 +23,7 @@ import tomli as tomllib import yaml -EXCLUDE = {"python", "c-compiler", "cxx-compiler", "c-blosc2"} +EXCLUDE = {"python", "c-compiler", "cxx-compiler"} REMAP_VERSION = {"tzdata": "2022.7"} CONDA_TO_PIP = { "pytables": "tables", diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py index 62a92cdd10ebc..7dd3e96e6ec18 100755 --- a/scripts/validate_min_versions_in_sync.py +++ b/scripts/validate_min_versions_in_sync.py @@ -36,7 +36,7 @@ SETUP_PATH = pathlib.Path("pyproject.toml").resolve() YAML_PATH = pathlib.Path("ci/deps") ENV_PATH = pathlib.Path("environment.yml") -EXCLUDE_DEPS = {"tzdata", "blosc", "c-blosc2", "pandas-gbq", "pyqt", "pyqt5"} +EXCLUDE_DEPS = {"tzdata", "blosc", "pandas-gbq", "pyqt", "pyqt5"} EXCLUSION_LIST = frozenset(["python=3.8[build=*_pypy]"]) # pandas package is not available # in pre-commit environment @@ -225,9 +225,6 @@ def get_versions_from_ci(content: list[str]) -> tuple[dict[str, str], dict[str, seen_required = True elif "# optional dependencies" in line: seen_optional = True - elif "#" in line: - # just a comment - continue elif "- pip:" in line: continue elif seen_required and line.strip(): From 6925b8ebe5c21e1984bc8ce98db514e7359fdc13 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 3 May 2024 09:09:02 -1000 Subject: [PATCH 178/396] Backport PR #58413: DEPS: Unpin docutils --- doc/source/user_guide/basics.rst | 7 +- doc/source/user_guide/gotchas.rst | 15 +--- doc/source/user_guide/groupby.rst | 77 ++++++++++----------- doc/source/user_guide/indexing.rst | 18 ++--- doc/source/user_guide/io.rst | 69 +++++++++---------- doc/source/user_guide/text.rst | 107 ++++++++++++++--------------- environment.yml | 1 - requirements-dev.txt | 1 - 8 files changed, 137 insertions(+), 158 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index f7d89110e6c8f..2ed446324f6b9 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -160,11 +160,10 @@ Here is a sample (using 100 column x 100,000 row ``DataFrames``): .. csv-table:: :header: "Operation", "0.11.0 (ms)", "Prior Version (ms)", "Ratio to Prior" :widths: 25, 25, 25, 25 - :delim: ; - ``df1 > df2``; 13.32; 125.35; 0.1063 - ``df1 * df2``; 21.71; 36.63; 0.5928 - ``df1 + df2``; 22.04; 36.50; 0.6039 + ``df1 > df2``, 13.32, 125.35, 0.1063 + ``df1 * df2``, 21.71, 36.63, 0.5928 + ``df1 + df2``, 22.04, 36.50, 0.6039 You are highly encouraged to install both libraries. See the section :ref:`Recommended Dependencies ` for more installation info. diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 99c85ac66623d..26eb656357bf6 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -315,19 +315,8 @@ Why not make NumPy like R? Many people have suggested that NumPy should simply emulate the ``NA`` support present in the more domain-specific statistical programming language `R -`__. Part of the reason is the NumPy type hierarchy: - -.. 
csv-table:: - :header: "Typeclass","Dtypes" - :widths: 30,70 - :delim: | - - ``numpy.floating`` | ``float16, float32, float64, float128`` - ``numpy.integer`` | ``int8, int16, int32, int64`` - ``numpy.unsignedinteger`` | ``uint8, uint16, uint32, uint64`` - ``numpy.object_`` | ``object_`` - ``numpy.bool_`` | ``bool_`` - ``numpy.character`` | ``bytes_, str_`` +`__. Part of the reason is the +`NumPy type hierarchy `__. The R language, by contrast, only has a handful of built-in data types: ``integer``, ``numeric`` (floating-point), ``character``, and diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 11863f8aead31..ea08ffe061244 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -509,29 +509,28 @@ listed below, those with a ``*`` do *not* have an efficient, GroupBy-specific, i .. csv-table:: :header: "Method", "Description" :widths: 20, 80 - :delim: ; - - :meth:`~.DataFrameGroupBy.any`;Compute whether any of the values in the groups are truthy - :meth:`~.DataFrameGroupBy.all`;Compute whether all of the values in the groups are truthy - :meth:`~.DataFrameGroupBy.count`;Compute the number of non-NA values in the groups - :meth:`~.DataFrameGroupBy.cov` * ;Compute the covariance of the groups - :meth:`~.DataFrameGroupBy.first`;Compute the first occurring value in each group - :meth:`~.DataFrameGroupBy.idxmax`;Compute the index of the maximum value in each group - :meth:`~.DataFrameGroupBy.idxmin`;Compute the index of the minimum value in each group - :meth:`~.DataFrameGroupBy.last`;Compute the last occurring value in each group - :meth:`~.DataFrameGroupBy.max`;Compute the maximum value in each group - :meth:`~.DataFrameGroupBy.mean`;Compute the mean of each group - :meth:`~.DataFrameGroupBy.median`;Compute the median of each group - :meth:`~.DataFrameGroupBy.min`;Compute the minimum value in each group - :meth:`~.DataFrameGroupBy.nunique`;Compute the number of unique values in each group - :meth:`~.DataFrameGroupBy.prod`;Compute the product of the values in each group - :meth:`~.DataFrameGroupBy.quantile`;Compute a given quantile of the values in each group - :meth:`~.DataFrameGroupBy.sem`;Compute the standard error of the mean of the values in each group - :meth:`~.DataFrameGroupBy.size`;Compute the number of values in each group - :meth:`~.DataFrameGroupBy.skew` *;Compute the skew of the values in each group - :meth:`~.DataFrameGroupBy.std`;Compute the standard deviation of the values in each group - :meth:`~.DataFrameGroupBy.sum`;Compute the sum of the values in each group - :meth:`~.DataFrameGroupBy.var`;Compute the variance of the values in each group + + :meth:`~.DataFrameGroupBy.any`,Compute whether any of the values in the groups are truthy + :meth:`~.DataFrameGroupBy.all`,Compute whether all of the values in the groups are truthy + :meth:`~.DataFrameGroupBy.count`,Compute the number of non-NA values in the groups + :meth:`~.DataFrameGroupBy.cov` * ,Compute the covariance of the groups + :meth:`~.DataFrameGroupBy.first`,Compute the first occurring value in each group + :meth:`~.DataFrameGroupBy.idxmax`,Compute the index of the maximum value in each group + :meth:`~.DataFrameGroupBy.idxmin`,Compute the index of the minimum value in each group + :meth:`~.DataFrameGroupBy.last`,Compute the last occurring value in each group + :meth:`~.DataFrameGroupBy.max`,Compute the maximum value in each group + :meth:`~.DataFrameGroupBy.mean`,Compute the mean of each group + :meth:`~.DataFrameGroupBy.median`,Compute the median of 
each group + :meth:`~.DataFrameGroupBy.min`,Compute the minimum value in each group + :meth:`~.DataFrameGroupBy.nunique`,Compute the number of unique values in each group + :meth:`~.DataFrameGroupBy.prod`,Compute the product of the values in each group + :meth:`~.DataFrameGroupBy.quantile`,Compute a given quantile of the values in each group + :meth:`~.DataFrameGroupBy.sem`,Compute the standard error of the mean of the values in each group + :meth:`~.DataFrameGroupBy.size`,Compute the number of values in each group + :meth:`~.DataFrameGroupBy.skew` * ,Compute the skew of the values in each group + :meth:`~.DataFrameGroupBy.std`,Compute the standard deviation of the values in each group + :meth:`~.DataFrameGroupBy.sum`,Compute the sum of the values in each group + :meth:`~.DataFrameGroupBy.var`,Compute the variance of the values in each group Some examples: @@ -835,19 +834,18 @@ The following methods on GroupBy act as transformations. .. csv-table:: :header: "Method", "Description" :widths: 20, 80 - :delim: ; - - :meth:`~.DataFrameGroupBy.bfill`;Back fill NA values within each group - :meth:`~.DataFrameGroupBy.cumcount`;Compute the cumulative count within each group - :meth:`~.DataFrameGroupBy.cummax`;Compute the cumulative max within each group - :meth:`~.DataFrameGroupBy.cummin`;Compute the cumulative min within each group - :meth:`~.DataFrameGroupBy.cumprod`;Compute the cumulative product within each group - :meth:`~.DataFrameGroupBy.cumsum`;Compute the cumulative sum within each group - :meth:`~.DataFrameGroupBy.diff`;Compute the difference between adjacent values within each group - :meth:`~.DataFrameGroupBy.ffill`;Forward fill NA values within each group - :meth:`~.DataFrameGroupBy.pct_change`;Compute the percent change between adjacent values within each group - :meth:`~.DataFrameGroupBy.rank`;Compute the rank of each value within each group - :meth:`~.DataFrameGroupBy.shift`;Shift values up or down within each group + + :meth:`~.DataFrameGroupBy.bfill`,Back fill NA values within each group + :meth:`~.DataFrameGroupBy.cumcount`,Compute the cumulative count within each group + :meth:`~.DataFrameGroupBy.cummax`,Compute the cumulative max within each group + :meth:`~.DataFrameGroupBy.cummin`,Compute the cumulative min within each group + :meth:`~.DataFrameGroupBy.cumprod`,Compute the cumulative product within each group + :meth:`~.DataFrameGroupBy.cumsum`,Compute the cumulative sum within each group + :meth:`~.DataFrameGroupBy.diff`,Compute the difference between adjacent values within each group + :meth:`~.DataFrameGroupBy.ffill`,Forward fill NA values within each group + :meth:`~.DataFrameGroupBy.pct_change`,Compute the percent change between adjacent values within each group + :meth:`~.DataFrameGroupBy.rank`,Compute the rank of each value within each group + :meth:`~.DataFrameGroupBy.shift`,Shift values up or down within each group In addition, passing any built-in aggregation method as a string to :meth:`~.DataFrameGroupBy.transform` (see the next section) will broadcast the result @@ -1095,11 +1093,10 @@ efficient, GroupBy-specific, implementation. .. 
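The tables above separate GroupBy aggregations from transformations; the practical difference fits in two calls (pandas assumed installed):

    import pandas as pd

    df = pd.DataFrame({"g": ["a", "a", "b"], "v": [1, 2, 3]})
    print(df.groupby("g")["v"].sum())     # aggregation: one value per group
    print(df.groupby("g")["v"].cumsum())  # transformation: same length as the input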
csv-table:: :header: "Method", "Description" :widths: 20, 80 - :delim: ; - :meth:`~.DataFrameGroupBy.head`;Select the top row(s) of each group - :meth:`~.DataFrameGroupBy.nth`;Select the nth row(s) of each group - :meth:`~.DataFrameGroupBy.tail`;Select the bottom row(s) of each group + :meth:`~.DataFrameGroupBy.head`,Select the top row(s) of each group + :meth:`~.DataFrameGroupBy.nth`,Select the nth row(s) of each group + :meth:`~.DataFrameGroupBy.tail`,Select the bottom row(s) of each group Users can also use transformations along with Boolean indexing to construct complex filtrations within groups. For example, suppose we are given groups of products and diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index ba5a5c7db614b..6c7aa15bfb75d 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -101,13 +101,14 @@ well). Any of the axes accessors may be the null slice ``:``. Axes left out of the specification are assumed to be ``:``, e.g. ``p.loc['a']`` is equivalent to ``p.loc['a', :]``. -.. csv-table:: - :header: "Object Type", "Indexers" - :widths: 30, 50 - :delim: ; - Series; ``s.loc[indexer]`` - DataFrame; ``df.loc[row_indexer,column_indexer]`` +.. ipython:: python + + ser = pd.Series(range(5), index=list("abcde")) + ser.loc[["a", "c", "e"]] + + df = pd.DataFrame(np.arange(25).reshape(5, 5), index=list("abcde"), columns=list("abcde")) + df.loc[["a", "c", "e"], ["b", "d"]] .. _indexing.basics: @@ -123,10 +124,9 @@ indexing pandas objects with ``[]``: .. csv-table:: :header: "Object Type", "Selection", "Return Value Type" :widths: 30, 30, 60 - :delim: ; - Series; ``series[label]``; scalar value - DataFrame; ``frame[colname]``; ``Series`` corresponding to colname + Series, ``series[label]``, scalar value + DataFrame, ``frame[colname]``, ``Series`` corresponding to colname Here we construct a simple time series data set to use for illustrating the indexing functionality: diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index b3ad23e0d4104..64777eb920d5a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -16,27 +16,26 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like .. 
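The ``.loc`` examples added to indexing.rst above rely on the doc build's standing imports; a fully standalone version of the same snippet:

    import numpy as np
    import pandas as pd

    ser = pd.Series(range(5), index=list("abcde"))
    print(ser.loc[["a", "c", "e"]])  # label-based selection on a Series

    df = pd.DataFrame(
        np.arange(25).reshape(5, 5), index=list("abcde"), columns=list("abcde")
    )
    print(df.loc[["a", "c", "e"], ["b", "d"]])  # row labels, then column labels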
csv-table:: :header: "Format Type", "Data Description", "Reader", "Writer" :widths: 30, 100, 60, 60 - :delim: ; - - text;`CSV `__;:ref:`read_csv`;:ref:`to_csv` - text;Fixed-Width Text File;:ref:`read_fwf` - text;`JSON `__;:ref:`read_json`;:ref:`to_json` - text;`HTML `__;:ref:`read_html`;:ref:`to_html` - text;`LaTeX `__;;:ref:`Styler.to_latex` - text;`XML `__;:ref:`read_xml`;:ref:`to_xml` - text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard` - binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` - binary;`OpenDocument `__;:ref:`read_excel`; - binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf` - binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` - binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` - binary;`ORC Format `__;:ref:`read_orc`;:ref:`to_orc` - binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata` - binary;`SAS `__;:ref:`read_sas`; - binary;`SPSS `__;:ref:`read_spss`; - binary;`Python Pickle Format `__;:ref:`read_pickle`;:ref:`to_pickle` - SQL;`SQL `__;:ref:`read_sql`;:ref:`to_sql` - SQL;`Google BigQuery `__;:ref:`read_gbq`;:ref:`to_gbq` + + text,`CSV `__, :ref:`read_csv`, :ref:`to_csv` + text,Fixed-Width Text File, :ref:`read_fwf` , NA + text,`JSON `__, :ref:`read_json`, :ref:`to_json` + text,`HTML `__, :ref:`read_html`, :ref:`to_html` + text,`LaTeX `__, :ref:`Styler.to_latex` , NA + text,`XML `__, :ref:`read_xml`, :ref:`to_xml` + text, Local clipboard, :ref:`read_clipboard`, :ref:`to_clipboard` + binary,`MS Excel `__ , :ref:`read_excel`, :ref:`to_excel` + binary,`OpenDocument `__, :ref:`read_excel`, NA + binary,`HDF5 Format `__, :ref:`read_hdf`, :ref:`to_hdf` + binary,`Feather Format `__, :ref:`read_feather`, :ref:`to_feather` + binary,`Parquet Format `__, :ref:`read_parquet`, :ref:`to_parquet` + binary,`ORC Format `__, :ref:`read_orc`, :ref:`to_orc` + binary,`Stata `__, :ref:`read_stata`, :ref:`to_stata` + binary,`SAS `__, :ref:`read_sas` , NA + binary,`SPSS `__, :ref:`read_spss` , NA + binary,`Python Pickle Format `__, :ref:`read_pickle`, :ref:`to_pickle` + SQL,`SQL `__, :ref:`read_sql`,:ref:`to_sql` + SQL,`Google BigQuery `__;:ref:`read_gbq`;:ref:`to_gbq` :ref:`Here ` is an informal performance comparison for some of these IO methods. @@ -1838,14 +1837,13 @@ with optional parameters: .. csv-table:: :widths: 20, 150 - :delim: ; - ``split``; dict like {index -> [index], columns -> [columns], data -> [values]} - ``records``; list like [{column -> value}, ... , {column -> value}] - ``index``; dict like {index -> {column -> value}} - ``columns``; dict like {column -> {index -> value}} - ``values``; just the values array - ``table``; adhering to the JSON `Table Schema`_ + ``split``, dict like {index -> [index]; columns -> [columns]; data -> [values]} + ``records``, list like [{column -> value}; ... ] + ``index``, dict like {index -> {column -> value}} + ``columns``, dict like {column -> {index -> value}} + ``values``, just the values array + ``table``, adhering to the JSON `Table Schema`_ * ``date_format`` : string, type of date conversion, 'epoch' for timestamp, 'iso' for ISO8601. * ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10. @@ -2033,14 +2031,13 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` .. csv-table:: :widths: 20, 150 - :delim: ; - - ``split``; dict like {index -> [index], columns -> [columns], data -> [values]} - ``records``; list like [{column -> value}, ... 
, {column -> value}] - ``index``; dict like {index -> {column -> value}} - ``columns``; dict like {column -> {index -> value}} - ``values``; just the values array - ``table``; adhering to the JSON `Table Schema`_ + + ``split``, dict like {index -> [index]; columns -> [columns]; data -> [values]} + ``records``, list like [{column -> value} ...] + ``index``, dict like {index -> {column -> value}} + ``columns``, dict like {column -> {index -> value}} + ``values``, just the values array + ``table``, adhering to the JSON `Table Schema`_ * ``dtype`` : if True, infer dtypes, if a dict of column to dtype, then use those, if ``False``, then don't infer dtypes at all, default is True, apply only to the data. diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index cf27fc8385223..ad2690ae395be 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -726,57 +726,56 @@ Method summary .. csv-table:: :header: "Method", "Description" :widths: 20, 80 - :delim: ; - - :meth:`~Series.str.cat`;Concatenate strings - :meth:`~Series.str.split`;Split strings on delimiter - :meth:`~Series.str.rsplit`;Split strings on delimiter working from the end of the string - :meth:`~Series.str.get`;Index into each element (retrieve i-th element) - :meth:`~Series.str.join`;Join strings in each element of the Series with passed separator - :meth:`~Series.str.get_dummies`;Split strings on the delimiter returning DataFrame of dummy variables - :meth:`~Series.str.contains`;Return boolean array if each string contains pattern/regex - :meth:`~Series.str.replace`;Replace occurrences of pattern/regex/string with some other string or the return value of a callable given the occurrence - :meth:`~Series.str.removeprefix`;Remove prefix from string, i.e. only remove if string starts with prefix. - :meth:`~Series.str.removesuffix`;Remove suffix from string, i.e. only remove if string ends with suffix. 
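The ``orient`` tables above enumerate the JSON layouts by name; a quick check of two of them (pandas assumed installed):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]}, index=["x", "y"])
    print(df.to_json(orient="split"))    # {"columns":["a"],"index":["x","y"],"data":[[1],[2]]}
    print(df.to_json(orient="records"))  # [{"a":1},{"a":2}]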
- :meth:`~Series.str.repeat`;Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) - :meth:`~Series.str.pad`;"Add whitespace to left, right, or both sides of strings" - :meth:`~Series.str.center`;Equivalent to ``str.center`` - :meth:`~Series.str.ljust`;Equivalent to ``str.ljust`` - :meth:`~Series.str.rjust`;Equivalent to ``str.rjust`` - :meth:`~Series.str.zfill`;Equivalent to ``str.zfill`` - :meth:`~Series.str.wrap`;Split long strings into lines with length less than a given width - :meth:`~Series.str.slice`;Slice each string in the Series - :meth:`~Series.str.slice_replace`;Replace slice in each string with passed value - :meth:`~Series.str.count`;Count occurrences of pattern - :meth:`~Series.str.startswith`;Equivalent to ``str.startswith(pat)`` for each element - :meth:`~Series.str.endswith`;Equivalent to ``str.endswith(pat)`` for each element - :meth:`~Series.str.findall`;Compute list of all occurrences of pattern/regex for each string - :meth:`~Series.str.match`;"Call ``re.match`` on each element, returning matched groups as list" - :meth:`~Series.str.extract`;"Call ``re.search`` on each element, returning DataFrame with one row for each element and one column for each regex capture group" - :meth:`~Series.str.extractall`;"Call ``re.findall`` on each element, returning DataFrame with one row for each match and one column for each regex capture group" - :meth:`~Series.str.len`;Compute string lengths - :meth:`~Series.str.strip`;Equivalent to ``str.strip`` - :meth:`~Series.str.rstrip`;Equivalent to ``str.rstrip`` - :meth:`~Series.str.lstrip`;Equivalent to ``str.lstrip`` - :meth:`~Series.str.partition`;Equivalent to ``str.partition`` - :meth:`~Series.str.rpartition`;Equivalent to ``str.rpartition`` - :meth:`~Series.str.lower`;Equivalent to ``str.lower`` - :meth:`~Series.str.casefold`;Equivalent to ``str.casefold`` - :meth:`~Series.str.upper`;Equivalent to ``str.upper`` - :meth:`~Series.str.find`;Equivalent to ``str.find`` - :meth:`~Series.str.rfind`;Equivalent to ``str.rfind`` - :meth:`~Series.str.index`;Equivalent to ``str.index`` - :meth:`~Series.str.rindex`;Equivalent to ``str.rindex`` - :meth:`~Series.str.capitalize`;Equivalent to ``str.capitalize`` - :meth:`~Series.str.swapcase`;Equivalent to ``str.swapcase`` - :meth:`~Series.str.normalize`;Return Unicode normal form. 
Equivalent to ``unicodedata.normalize`` - :meth:`~Series.str.translate`;Equivalent to ``str.translate`` - :meth:`~Series.str.isalnum`;Equivalent to ``str.isalnum`` - :meth:`~Series.str.isalpha`;Equivalent to ``str.isalpha`` - :meth:`~Series.str.isdigit`;Equivalent to ``str.isdigit`` - :meth:`~Series.str.isspace`;Equivalent to ``str.isspace`` - :meth:`~Series.str.islower`;Equivalent to ``str.islower`` - :meth:`~Series.str.isupper`;Equivalent to ``str.isupper`` - :meth:`~Series.str.istitle`;Equivalent to ``str.istitle`` - :meth:`~Series.str.isnumeric`;Equivalent to ``str.isnumeric`` - :meth:`~Series.str.isdecimal`;Equivalent to ``str.isdecimal`` + + :meth:`~Series.str.cat`,Concatenate strings + :meth:`~Series.str.split`,Split strings on delimiter + :meth:`~Series.str.rsplit`,Split strings on delimiter working from the end of the string + :meth:`~Series.str.get`,Index into each element (retrieve i-th element) + :meth:`~Series.str.join`,Join strings in each element of the Series with passed separator + :meth:`~Series.str.get_dummies`,Split strings on the delimiter returning DataFrame of dummy variables + :meth:`~Series.str.contains`,Return boolean array if each string contains pattern/regex + :meth:`~Series.str.replace`,Replace occurrences of pattern/regex/string with some other string or the return value of a callable given the occurrence + :meth:`~Series.str.removeprefix`,Remove prefix from string i.e. only remove if string starts with prefix. + :meth:`~Series.str.removesuffix`,Remove suffix from string i.e. only remove if string ends with suffix. + :meth:`~Series.str.repeat`,Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) + :meth:`~Series.str.pad`,Add whitespace to the sides of strings + :meth:`~Series.str.center`,Equivalent to ``str.center`` + :meth:`~Series.str.ljust`,Equivalent to ``str.ljust`` + :meth:`~Series.str.rjust`,Equivalent to ``str.rjust`` + :meth:`~Series.str.zfill`,Equivalent to ``str.zfill`` + :meth:`~Series.str.wrap`,Split long strings into lines with length less than a given width + :meth:`~Series.str.slice`,Slice each string in the Series + :meth:`~Series.str.slice_replace`,Replace slice in each string with passed value + :meth:`~Series.str.count`,Count occurrences of pattern + :meth:`~Series.str.startswith`,Equivalent to ``str.startswith(pat)`` for each element + :meth:`~Series.str.endswith`,Equivalent to ``str.endswith(pat)`` for each element + :meth:`~Series.str.findall`,Compute list of all occurrences of pattern/regex for each string + :meth:`~Series.str.match`,Call ``re.match`` on each element returning matched groups as list + :meth:`~Series.str.extract`,Call ``re.search`` on each element returning DataFrame with one row for each element and one column for each regex capture group + :meth:`~Series.str.extractall`,Call ``re.findall`` on each element returning DataFrame with one row for each match and one column for each regex capture group + :meth:`~Series.str.len`,Compute string lengths + :meth:`~Series.str.strip`,Equivalent to ``str.strip`` + :meth:`~Series.str.rstrip`,Equivalent to ``str.rstrip`` + :meth:`~Series.str.lstrip`,Equivalent to ``str.lstrip`` + :meth:`~Series.str.partition`,Equivalent to ``str.partition`` + :meth:`~Series.str.rpartition`,Equivalent to ``str.rpartition`` + :meth:`~Series.str.lower`,Equivalent to ``str.lower`` + :meth:`~Series.str.casefold`,Equivalent to ``str.casefold`` + :meth:`~Series.str.upper`,Equivalent to ``str.upper`` + :meth:`~Series.str.find`,Equivalent to ``str.find`` + :meth:`~Series.str.rfind`,Equivalent to 
``str.rfind`` + :meth:`~Series.str.index`,Equivalent to ``str.index`` + :meth:`~Series.str.rindex`,Equivalent to ``str.rindex`` + :meth:`~Series.str.capitalize`,Equivalent to ``str.capitalize`` + :meth:`~Series.str.swapcase`,Equivalent to ``str.swapcase`` + :meth:`~Series.str.normalize`,Return Unicode normal form. Equivalent to ``unicodedata.normalize`` + :meth:`~Series.str.translate`,Equivalent to ``str.translate`` + :meth:`~Series.str.isalnum`,Equivalent to ``str.isalnum`` + :meth:`~Series.str.isalpha`,Equivalent to ``str.isalpha`` + :meth:`~Series.str.isdigit`,Equivalent to ``str.isdigit`` + :meth:`~Series.str.isspace`,Equivalent to ``str.isspace`` + :meth:`~Series.str.islower`,Equivalent to ``str.islower`` + :meth:`~Series.str.isupper`,Equivalent to ``str.isupper`` + :meth:`~Series.str.istitle`,Equivalent to ``str.istitle`` + :meth:`~Series.str.isnumeric`,Equivalent to ``str.isnumeric`` + :meth:`~Series.str.isdecimal`,Equivalent to ``str.isdecimal`` diff --git a/environment.yml b/environment.yml index 30c078051d330..58eb69ad1f070 100644 --- a/environment.yml +++ b/environment.yml @@ -88,7 +88,6 @@ dependencies: - numpydoc - pydata-sphinx-theme=0.14 - pytest-cython # doctest - - docutils < 0.21 # https://github.com/sphinx-doc/sphinx/issues/12302 - sphinx - sphinx-design - sphinx-copybutton diff --git a/requirements-dev.txt b/requirements-dev.txt index c19ae8ea93bb5..5a63e59e1db88 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -63,7 +63,6 @@ natsort numpydoc pydata-sphinx-theme==0.14 pytest-cython -docutils < 0.21 sphinx sphinx-design sphinx-copybutton From 0ed998233478591eec45fdf298a45da1841f81c7 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 16 Sep 2024 01:33:15 -0700 Subject: [PATCH 179/396] Backport PR #59811 on branch 2.2.x (DOC: add whatsnew for v2.2.3) (#59812) Backport PR #59811: DOC: add whatsnew for v2.2.3 Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v2.2.3.rst | 36 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 doc/source/whatsnew/v2.2.3.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 34a2845290d5a..09d76d71c6e1b 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 2.2 .. toctree:: :maxdepth: 2 + v2.2.3 v2.2.2 v2.2.1 v2.2.0 diff --git a/doc/source/whatsnew/v2.2.3.rst b/doc/source/whatsnew/v2.2.3.rst new file mode 100644 index 0000000000000..aa6e241e74b0a --- /dev/null +++ b/doc/source/whatsnew/v2.2.3.rst @@ -0,0 +1,36 @@ +.. _whatsnew_223: + +What's new in 2.2.3 (September XX, 2024) +---------------------------------------- + +These are the changes in pandas 2.2.3. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_223.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_223.bug_fixes: + +Bug fixes +~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_223.other: + +Other +~~~~~ +- + +.. --------------------------------------------------------------------------- +.. 
_whatsnew_223.contributors: + +Contributors +~~~~~~~~~~~~ From e5a2067a0289b6867ff03302686cbdcbcf945436 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 16 Sep 2024 22:06:58 +0200 Subject: [PATCH 180/396] Backport PR #59818 on branch 2.2.x (BUG: Remove np._get_promotion_state usage) (#59821) BUG: Remove np._get_promotion_state usage (#59818) (cherry picked from commit 081dcdee8d754af90e307cf2311b06b3d02fae2a) Co-authored-by: Lysandros Nikolaou --- pandas/tests/series/indexing/test_setitem.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 23137f0975fb1..29ad674d1cadf 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -3,10 +3,12 @@ datetime, ) from decimal import Decimal +import os import numpy as np import pytest +from pandas.compat import WASM from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import IndexingError @@ -1443,7 +1445,11 @@ def obj(self): marks=pytest.mark.xfail( ( not np_version_gte1p24 - or (np_version_gte1p24 and np._get_promotion_state() != "weak") + or ( + np_version_gte1p24 + and os.environ.get("NPY_PROMOTION_STATE", "weak") != "weak" + ) + or WASM ), reason="np.float32(1.1) ends up as 1.100000023841858, so " "np_can_hold_element raises and we cast to float64", From 4a20adbd7d707f73491b930fe9a51e1607a7e070 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 17 Sep 2024 16:20:25 -0700 Subject: [PATCH 181/396] Backport PR #59813 on branch 2.2.x (CI: Debug failing ARM builds) (#59828) Backport PR #59813: CI: Debug failing ARM builds Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/tests/extension/test_sparse.py | 5 +++++ pandas/tests/series/test_ufunc.py | 5 ++++- pyproject.toml | 12 ++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 4039a5d01f372..2d5989a5b4f1d 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -348,11 +348,16 @@ def test_argmin_argmax_all_na(self, method, data, na_value): self._check_unsupported(data) super().test_argmin_argmax_all_na(method, data, na_value) + @pytest.mark.fails_arm_wheels @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame]) def test_equals(self, data, na_value, as_series, box): self._check_unsupported(data) super().test_equals(data, na_value, as_series, box) + @pytest.mark.fails_arm_wheels + def test_equals_same_data_different_object(self, data): + super().test_equals_same_data_different_object(data) + @pytest.mark.parametrize( "func, na_action, expected", [ diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 9d13ebf740eab..e03e87a44107f 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -18,7 +18,10 @@ def ufunc(request): return request.param -@pytest.fixture(params=[True, False], ids=["sparse", "dense"]) +@pytest.fixture( + params=[pytest.param(True, marks=pytest.mark.fails_arm_wheels), False], + ids=["sparse", "dense"], +) def sparse(request): return request.param diff --git a/pyproject.toml b/pyproject.toml index db9f055799ab0..6443014843229 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -169,6 +169,14 @@ test-command = """ before-build = "pip install delvewheel 
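The ``fails_arm_wheels`` marker used above has to be registered before pytest will accept it under strict markers; pandas does so through the ``markers`` list in the pyproject.toml hunk below. For illustration, a conftest.py-based registration that would be equivalent:

    # conftest.py (illustrative alternative to the pyproject.toml `markers` entry)
    def pytest_configure(config):
        config.addinivalue_line(
            "markers",
            "fails_arm_wheels: Tests that fail in the ARM wheel build only",
        )

Either way, the ARM wheel test command can then deselect the marked tests with ``-m "not fails_arm_wheels"``.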
numpy==2.0.0rc1" repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}" +[[tool.cibuildwheel.overrides]] +select = "*-manylinux_aarch64*" +test-command = """ + PANDAS_CI='1' python -c 'import pandas as pd; \ + pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db and not fails_arm_wheels", "-n 2", "--no-strict-data-files"]); \ + pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ + """ + [[tool.cibuildwheel.overrides]] select = "*-musllinux*" before-test = "apk update && apk add musl-locales" @@ -525,6 +533,10 @@ markers = [ "clipboard: mark a pd.read_clipboard test", "arm_slow: mark a test as slow for arm64 architecture", "skip_ubsan: Tests known to fail UBSAN check", + # TODO: someone should investigate this ... + # these tests only fail in the wheel builder and don't fail in regular + # ARM CI + "fails_arm_wheels: Tests that fail in the ARM wheel build only", ] [tool.mypy] From 2127b4207abdbb355dbe32c66cfd50c16ff253b3 Mon Sep 17 00:00:00 2001 From: Ben Greiner Date: Wed, 18 Sep 2024 14:01:40 +0200 Subject: [PATCH 182/396] Backport #59144 on 2.2.x / 2.3.x (remove ops div class to solve #2137) (#59535) * remove core.computation.ops.Div resolves #21374 #58748 * need to preserve order * updating tests * (update whatsnew -- no whatsnew for 2.2.x and 2.3 yet) * solve mypy issue * fixing pytests * better than cast * adding specific test (* Update pandas/tests/frame/test_query_eval.py // Not backported, fails on 2.2) * Update pandas/tests/computation/test_eval.py --------- Co-authored-by: Laurent Mutricy Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/_testing/__init__.py | 1 + pandas/conftest.py | 15 ++++++++ pandas/core/computation/expr.py | 6 +--- pandas/core/computation/ops.py | 49 --------------------------- pandas/tests/computation/test_eval.py | 22 ++++++++---- 5 files changed, 33 insertions(+), 60 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 361998db8e38b..87d419e2db8dd 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -111,6 +111,7 @@ COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] STRING_DTYPES: list[Dtype] = [str, "str", "U"] +COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] DATETIME64_DTYPES: list[Dtype] = ["datetime64[ns]", "M8[ns]"] TIMEDELTA64_DTYPES: list[Dtype] = ["timedelta64[ns]", "m8[ns]"] diff --git a/pandas/conftest.py b/pandas/conftest.py index 7c35dfdde90ba..10134c90f8eeb 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1403,6 +1403,21 @@ def complex_dtype(request): return request.param +@pytest.fixture(params=tm.COMPLEX_FLOAT_DTYPES) +def complex_or_float_dtype(request): + """ + Parameterized fixture for complex and numpy float dtypes. 
+ + * complex + * 'complex64' + * 'complex128' + * float + * 'float32' + * 'float64' + """ + return request.param + + @pytest.fixture(params=tm.SIGNED_INT_NUMPY_DTYPES) def any_signed_int_numpy_dtype(request): """ diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index b5861fbaebe9c..d642c37cea129 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -31,7 +31,6 @@ UNARY_OPS_SYMS, BinOp, Constant, - Div, FuncNode, Op, Term, @@ -370,7 +369,7 @@ class BaseExprVisitor(ast.NodeVisitor): "Add", "Sub", "Mult", - None, + "Div", "Pow", "FloorDiv", "Mod", @@ -533,9 +532,6 @@ def visit_BinOp(self, node, **kwargs): left, right = self._maybe_downcast_constants(left, right) return self._maybe_evaluate_binop(op, op_class, left, right) - def visit_Div(self, node, **kwargs): - return lambda lhs, rhs: Div(lhs, rhs) - def visit_UnaryOp(self, node, **kwargs): op = self.visit(node.op) operand = self.visit(node.operand) diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 95ac20ba39edc..d8265456dfced 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -332,31 +332,6 @@ def _not_in(x, y): _binary_ops_dict.update(d) -def _cast_inplace(terms, acceptable_dtypes, dtype) -> None: - """ - Cast an expression inplace. - - Parameters - ---------- - terms : Op - The expression that should cast. - acceptable_dtypes : list of acceptable numpy.dtype - Will not cast if term's dtype in this list. - dtype : str or numpy.dtype - The dtype to cast to. - """ - dt = np.dtype(dtype) - for term in terms: - if term.type in acceptable_dtypes: - continue - - try: - new_value = term.value.astype(dt) - except AttributeError: - new_value = dt.type(term.value) - term.update(new_value) - - def is_term(obj) -> bool: return isinstance(obj, Term) @@ -517,30 +492,6 @@ def isnumeric(dtype) -> bool: return issubclass(np.dtype(dtype).type, np.number) -class Div(BinOp): - """ - Div operator to special case casting. - - Parameters - ---------- - lhs, rhs : Term or Op - The Terms or Ops in the ``/`` expression. - """ - - def __init__(self, lhs, rhs) -> None: - super().__init__("/", lhs, rhs) - - if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): - raise TypeError( - f"unsupported operand type(s) for {self.op}: " - f"'{lhs.return_type}' and '{rhs.return_type}'" - ) - - # do not upcast float32s to float64 un-necessarily - acceptable_dtypes = [np.float32, np.float64] - _cast_inplace(com.flatten(self), acceptable_dtypes, np.float64) - - UNARY_OPS_SYMS = ("+", "-", "~", "not") _unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert) _unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs)) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 17630f14b08c7..e8fad6b8cbd63 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -747,16 +747,26 @@ class TestTypeCasting: @pytest.mark.parametrize("op", ["+", "-", "*", "**", "/"]) # maybe someday... 
numexpr has too many upcasting rules now # chain(*(np.core.sctypes[x] for x in ['uint', 'int', 'float'])) - @pytest.mark.parametrize("dt", [np.float32, np.float64]) @pytest.mark.parametrize("left_right", [("df", "3"), ("3", "df")]) - def test_binop_typecasting(self, engine, parser, op, dt, left_right): - df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)), dtype=dt) + def test_binop_typecasting( + self, engine, parser, op, complex_or_float_dtype, left_right, request + ): + # GH#21374 + dtype = complex_or_float_dtype + df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)), dtype=dtype) left, right = left_right s = f"{left} {op} {right}" res = pd.eval(s, engine=engine, parser=parser) - assert df.values.dtype == dt - assert res.values.dtype == dt - tm.assert_frame_equal(res, eval(s)) + if dtype == "complex64" and engine == "numexpr": + mark = pytest.mark.xfail( + reason="numexpr issue with complex that are upcast " + "to complex 128 " + "https://github.com/pydata/numexpr/issues/492" + ) + request.applymarker(mark) + assert df.values.dtype == dtype + assert res.values.dtype == dtype + tm.assert_frame_equal(res, eval(s), check_exact=False) # ------------------------------------- From f7b63786ace286fa8bd0fee1a75589d41883b6df Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 18 Sep 2024 13:37:02 -0400 Subject: [PATCH 183/396] Assorted backports for 2.2.x (#59785) * Backport PR #59065: ENH: Fix Python 3.13 test failures & enable CI * Remove deprecated plot_date calls (#58484) * Remove deprecated plot_date calls These were deprecated in Matplotlib 3.9. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> (cherry picked from commit c9bc4809528998313a609ab16168ca237bc186b6) * Pick out fastparquet xfails for green CI * pin pytz to fix test_arrays.py * more workflow tweaks for pytz and Python 3.13 * fix typing and tune tests for copy on write * remove WASM stuff * more arm skips * go for green --------- Co-authored-by: Lysandros Nikolaou Co-authored-by: Elliott Sales de Andrade --- .circleci/config.yml | 8 ++++---- .github/workflows/unit-tests.yml | 10 +++++----- .github/workflows/wheels.yml | 6 +++--- ci/deps/actions-310.yaml | 3 ++- ci/deps/actions-311-downstream_compat.yaml | 3 ++- ci/deps/actions-311-numpydev.yaml | 3 ++- ci/deps/actions-311-pyarrownightly.yaml | 3 ++- ci/deps/actions-311.yaml | 3 ++- ci/deps/actions-312.yaml | 3 ++- ci/deps/actions-39.yaml | 3 ++- ci/deps/actions-pypy-39.yaml | 1 + ci/deps/circle-310-arm64.yaml | 3 ++- .../src/vendored/ujson/python/objToJSON.c | 12 ++++++------ pandas/_libs/tslibs/offsets.pyx | 7 ++++++- pandas/io/gbq.py | 6 +++--- pandas/tests/groupby/test_groupby.py | 4 +++- .../indexes/interval/test_interval_tree.py | 1 - pandas/tests/indexes/test_common.py | 1 + .../tests/indexing/interval/test_interval.py | 2 -- .../indexing/interval/test_interval_new.py | 3 --- pandas/tests/io/parser/test_dialect.py | 2 +- pandas/tests/io/test_common.py | 5 ++++- pandas/tests/io/test_parquet.py | 15 +++++---------- pandas/tests/io/xml/test_xml.py | 2 +- pandas/tests/plotting/test_datetimelike.py | 18 ++++++++++++------ .../tests/scalar/timedelta/test_arithmetic.py | 1 + pandas/tests/series/indexing/test_setitem.py | 10 ++++++---- pyproject.toml | 14 +++++--------- 28 files changed, 83 insertions(+), 69 deletions(-) 
diff --git a/.circleci/config.yml b/.circleci/config.yml index 0748d6550fe2d..9ef3f9e2857a0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -14,10 +14,10 @@ jobs: steps: - checkout - run: .circleci/setup_env.sh - - run: > - PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH - LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD + - run: | sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 + PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH \ + LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD \ ci/run_tests.sh linux-musl: docker: @@ -35,7 +35,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil "pytz<2024.2" pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" python -m pip list --no-cache-dir - run: | diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index c1965fcbd9236..ad63908e4682d 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -257,7 +257,7 @@ jobs: . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true" - python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil "pytz<2024.2" pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -295,7 +295,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil "pytz<2024.2" pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" python -m pip list --no-cache-dir @@ -329,7 +329,7 @@ jobs: # To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs # to the corresponding posix/windows-macos/sdist etc. workflows. # Feel free to modify this comment as necessary. 
- if: false # Uncomment this to freeze the workflow, comment it to unfreeze + # if: false # Uncomment this to freeze the workflow, comment it to unfreeze defaults: run: shell: bash -eou pipefail {0} @@ -361,7 +361,7 @@ jobs: - name: Set up Python Dev Version uses: actions/setup-python@v5 with: - python-version: '3.12-dev' + python-version: '3.13-dev' - name: Build Environment run: | @@ -369,7 +369,7 @@ jobs: python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov + python -m pip install python-dateutil "pytz<2024.2" tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov python -m pip install -ve . --no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror" python -m pip list diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 4bd9068e91b67..3d4fbfb995fb3 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -170,13 +170,13 @@ jobs: shell: pwsh run: | $TST_CMD = @" - python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0; + python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytz<2024.2; python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); python -c `'import pandas as pd; pd.test(extra_args=[`\"--no-strict-data-files`\", `\"-m not clipboard and not single_cpu and not slow and not network and not db`\"])`'; "@ # add rc to the end of the image name if the Python version is unreleased - docker pull python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} - docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD + docker pull python:${{ matrix.python[1] == '3.13' && '3.13-rc' || format('{0}-windowsservercore', matrix.python[1]) }} + docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.13' && '3.13-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD - uses: actions/upload-artifact@v4 with: diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index a3e44e6373145..d0e788d1b124f 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -20,7 +20,8 @@ dependencies: # required dependencies - python-dateutil - numpy - - pytz + # pytz 2024.2 timezones cause wrong results + - pytz<2024.2 # optional dependencies - beautifulsoup4>=4.11.2 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index d6bf9ec7843de..7fda383dd9e1d 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -22,7 +22,8 @@ dependencies: # required dependencies - python-dateutil - numpy - - pytz + # pytz 2024.2 timezones cause wrong results + - pytz<2024.2 # optional dependencies - beautifulsoup4>=4.11.2 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index b62e8630f2059..21791e3a9c2eb 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -21,7 +21,8 @@ dependencies: # pandas dependencies - python-dateutil - - pytz + # 
pytz 2024.2 timezones cause wrong results + - pytz<2024.2 - pip - pip: diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 5455b9b84b034..b90fa2e044cd6 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -19,7 +19,8 @@ dependencies: # required dependencies - python-dateutil - numpy<2 - - pytz + # pytz 2024.2 timezones cause wrong results + - pytz<2024.2 - pip - pip: diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 95cd1a4d46ef4..c72d743bf3375 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -20,7 +20,8 @@ dependencies: # required dependencies - python-dateutil - numpy - - pytz + # pytz 2024.2 timezones cause wrong results + - pytz<2024.2 # optional dependencies - beautifulsoup4>=4.11.2 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index a442ed6feeb5d..032bd68c09ad6 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -20,7 +20,8 @@ dependencies: # required dependencies - python-dateutil - numpy - - pytz + # pytz 2024.2 timezones cause wrong results + - pytz<2024.2 # optional dependencies - beautifulsoup4>=4.11.2 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index b162a78e7f115..4320e9060fb4a 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -20,7 +20,8 @@ dependencies: # required dependencies - python-dateutil - numpy - - pytz + # pytz 2024.2 timezones cause wrong results + - pytz<2024.2 # optional dependencies - beautifulsoup4>=4.11.2 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index d9c8dd81b7c33..bdc07931988d1 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -22,6 +22,7 @@ dependencies: # required - numpy - python-dateutil + # pytz 2024.2 timezones cause wrong results - pytz - pip: - tzdata>=2022.7 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index a19ffd485262d..36c584bf1fd10 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -21,7 +21,8 @@ dependencies: # required dependencies - python-dateutil - numpy - - pytz + # pytz 2024.2 timezones cause wrong results + - pytz < 2024.2 # optional dependencies - beautifulsoup4>=4.11.2 diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index fa91db5fe34e3..5f35860c59cb7 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -410,8 +410,8 @@ static void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { npyarr->type_num = PyArray_DESCR(obj)->type_num; if (GET_TC(tc)->transpose) { - npyarr->dim = PyArray_DIM(obj, npyarr->ndim); - npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); + npyarr->dim = PyArray_DIM(obj, (int)npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, (int)npyarr->ndim); npyarr->stridedim = npyarr->ndim; npyarr->index[npyarr->ndim] = 0; npyarr->inc = -1; @@ -452,8 +452,8 @@ static void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { return; } const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; - npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim); + npyarr->dim = PyArray_DIM(arrayobj, (int)npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, (int)npyarr->stridedim); npyarr->dataptr += npyarr->stride; NpyArr_freeItemValue(obj, tc); @@ -524,8 +524,8 @@ static int 
NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { } const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; - npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim); + npyarr->dim = PyArray_DIM(arrayobj, (int)npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, (int)npyarr->stridedim); npyarr->index[npyarr->stridedim] = 0; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index c37a4b285daef..5dacd7dd55231 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4960,7 +4960,12 @@ cpdef to_offset(freq, bint is_period=False): if result is None: raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) - if is_period and not hasattr(result, "_period_dtype_code"): + try: + has_period_dtype_code = hasattr(result, "_period_dtype_code") + except ValueError: + has_period_dtype_code = False + + if is_period and not has_period_dtype_code: if isinstance(freq, str): raise ValueError(f"{result.name} is not supported as period frequency") else: diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 350002bf461ff..24e4e0b7cef0a 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -11,7 +11,7 @@ from pandas.util._exceptions import find_stack_level if TYPE_CHECKING: - import google.auth + from google.auth.credentials import Credentials from pandas import DataFrame @@ -37,7 +37,7 @@ def read_gbq( dialect: str | None = None, location: str | None = None, configuration: dict[str, Any] | None = None, - credentials: google.auth.credentials.Credentials | None = None, + credentials: Credentials | None = None, use_bqstorage_api: bool | None = None, max_results: int | None = None, progress_bar_type: str | None = None, @@ -230,7 +230,7 @@ def to_gbq( table_schema: list[dict[str, str]] | None = None, location: str | None = None, progress_bar: bool = True, - credentials: google.auth.credentials.Credentials | None = None, + credentials: Credentials | None = None, ) -> None: warnings.warn( "to_gbq is deprecated and will be removed in a future version. 
" diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ed9acdd0c9dde..44d6340e55507 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2816,7 +2816,9 @@ def test_rolling_wrong_param_min_period(): test_df = DataFrame([name_l, val_l]).T test_df.columns = ["name", "val"] - result_error_msg = r"__init__\(\) got an unexpected keyword argument 'min_period'" + result_error_msg = ( + r"^[a-zA-Z._]*\(\) got an unexpected keyword argument 'min_period'" + ) with pytest.raises(TypeError, match=result_error_msg): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py index 45b25f2533afd..78388e84fc6dc 100644 --- a/pandas/tests/indexes/interval/test_interval_tree.py +++ b/pandas/tests/indexes/interval/test_interval_tree.py @@ -190,7 +190,6 @@ def test_construction_overflow(self): expected = (50 + np.iinfo(np.int64).max) / 2 assert result == expected - @pytest.mark.xfail(not IS64, reason="GH 23440") @pytest.mark.parametrize( "left, right, expected", [ diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 80c39322b9b81..05b2aa584674c 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -452,6 +452,7 @@ def test_sort_values_invalid_na_position(index_with_missing, na_position): index_with_missing.sort_values(na_position=na_position) +@pytest.mark.fails_arm_wheels @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize("na_position", ["first", "last"]) def test_sort_values_with_missing(index_with_missing, na_position, request): diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index cabfee9aa040a..dd51917b85a59 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -2,7 +2,6 @@ import pytest from pandas._libs import index as libindex -from pandas.compat import IS64 import pandas as pd from pandas import ( @@ -210,7 +209,6 @@ def test_mi_intervalindex_slicing_with_scalar(self): expected = Series([1, 6, 2, 8, 7], index=expected_index, name="value") tm.assert_series_equal(result, expected) - @pytest.mark.xfail(not IS64, reason="GH 23440") @pytest.mark.parametrize( "base", [101, 1010], diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index 283921a23e368..018db5846f4e2 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat import IS64 - from pandas import ( Index, Interval, @@ -211,7 +209,6 @@ def test_loc_getitem_missing_key_error_message( obj.loc[[4, 5, 6]] -@pytest.mark.xfail(not IS64, reason="GH 23440") @pytest.mark.parametrize( "intervals", [ diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index 7a72e66996d43..803114723bc74 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -26,7 +26,7 @@ def custom_dialect(): "escapechar": "~", "delimiter": ":", "skipinitialspace": False, - "quotechar": "~", + "quotechar": "`", "quoting": 3, } return dialect_name, dialect_kwargs diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 
074033868635a..e51f86563081b 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -485,7 +485,10 @@ def test_warning_missing_utf_bom(self, encoding, compression_): df.to_csv(path, compression=compression_, encoding=encoding) # reading should fail (otherwise we wouldn't need the warning) - msg = r"UTF-\d+ stream does not start with BOM" + msg = ( + r"UTF-\d+ stream does not start with BOM|" + r"'utf-\d+' codec can't decode byte" + ) with pytest.raises(UnicodeError, match=msg): pd.read_csv(path, compression=compression_, encoding=encoding) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 8771793672263..760a64c8d4c33 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -16,7 +16,6 @@ pa_version_under11p0, pa_version_under13p0, pa_version_under15p0, - pa_version_under17p0, ) import pandas as pd @@ -449,12 +448,8 @@ def test_read_filters(self, engine, tmp_path): repeat=1, ) - def test_write_index(self, engine, using_copy_on_write, request): + def test_write_index(self, engine): check_names = engine != "fastparquet" - if using_copy_on_write and engine == "fastparquet": - request.applymarker( - pytest.mark.xfail(reason="fastparquet write into index") - ) df = pd.DataFrame({"A": [1, 2, 3]}) check_round_trip(df, engine) @@ -1064,9 +1059,6 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) - @pytest.mark.xfail( - pa_version_under17p0, reason="pa.pandas_compat passes 'datetime64' to .astype" - ) def test_columns_dtypes_not_invalid(self, pa): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) @@ -1314,7 +1306,10 @@ def test_empty_dataframe(self, fp): expected = df.copy() check_round_trip(df, fp, expected=expected) - @pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index") + @pytest.mark.xfail( + _HAVE_FASTPARQUET and Version(fastparquet.__version__) > Version("2022.12"), + reason="fastparquet bug, see https://github.com/dask/fastparquet/issues/929", + ) def test_timezone_aware_index(self, fp, timezone_aware_date_list): idx = 5 * [timezone_aware_date_list] diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 6f429c1ecbf8a..900734e9f0fdf 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1044,7 +1044,7 @@ def test_utf16_encoding(xml_baby_names, parser): UnicodeError, match=( "UTF-16 stream does not start with BOM|" - "'utf-16-le' codec can't decode byte" + "'utf-16(-le)?' 
codec can't decode byte" ), ): read_xml(xml_baby_names, encoding="UTF-16", parser=parser) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 112172656b6ec..6c318402ea226 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -1451,13 +1451,19 @@ def test_mpl_nopandas(self): values1 = np.arange(10.0, 11.0, 0.5) values2 = np.arange(11.0, 12.0, 0.5) - kw = {"fmt": "-", "lw": 4} - _, ax = mpl.pyplot.subplots() - ax.plot_date([x.toordinal() for x in dates], values1, **kw) - ax.plot_date([x.toordinal() for x in dates], values2, **kw) - - line1, line2 = ax.get_lines() + ( + line1, + line2, + ) = ax.plot( + [x.toordinal() for x in dates], + values1, + "-", + [x.toordinal() for x in dates], + values2, + "-", + linewidth=4, + ) exp = np.array([x.toordinal() for x in dates], dtype=np.float64) tm.assert_numpy_array_equal(line1.get_xydata()[:, 0], exp) diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 4fc59880c49dd..a4d846f068d00 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -622,6 +622,7 @@ def test_td_floordiv_invalid_scalar(self): [ r"Invalid dtype datetime64\[D\] for __floordiv__", "'dtype' is an invalid keyword argument for this function", + "this function got an unexpected keyword argument 'dtype'", r"ufunc '?floor_divide'? cannot use operands with types", ] ) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 29ad674d1cadf..ed681563f6fcd 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -8,8 +8,10 @@ import numpy as np import pytest -from pandas.compat import WASM -from pandas.compat.numpy import np_version_gte1p24 +from pandas.compat.numpy import ( + np_version_gt2, + np_version_gte1p24, +) from pandas.errors import IndexingError from pandas.core.dtypes.common import is_list_like @@ -1447,9 +1449,9 @@ def obj(self): not np_version_gte1p24 or ( np_version_gte1p24 - and os.environ.get("NPY_PROMOTION_STATE", "weak") != "weak" + and not np_version_gt2 + and os.environ.get("NPY_PROMOTION_STATE", "legacy") != "weak" ) - or WASM ), reason="np.float32(1.1) ends up as 1.100000023841858, so " "np_can_hold_element raises and we cast to float64", diff --git a/pyproject.toml b/pyproject.toml index 6443014843229..571c086d2220b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,9 +6,9 @@ requires = [ "meson==1.2.1", "wheel", "Cython==3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json - # Force numpy higher than 2.0rc1, so that built wheels are compatible + # Force numpy higher than 2.0, so that built wheels are compatible # with both numpy 1 and 2 - "numpy>=2.0.0rc1", + "numpy>=2.0", "versioneer[toml]" ] @@ -153,10 +153,8 @@ setup = ['--vsenv'] # For Windows skip = "cp36-* cp37-* cp38-* pp* *_i686 *_ppc64le *_s390x" build-verbosity = "3" environment = {LDFLAGS="-Wl,--strip-all"} -# TODO: remove this once numpy 2.0 proper releases -# and specify numpy 2.0 as a dependency in [build-system] requires in pyproject.toml -before-build = "pip install numpy==2.0.0rc1" -test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0" +# pytz 2024.2 causing some failures +test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytz<2024.2" test-command = """ PANDAS_CI='1' python -c 'import pandas as pd; \ 
pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "-n 2", "--no-strict-data-files"]); \ @@ -164,9 +162,7 @@ test-command = """ """ [tool.cibuildwheel.windows] -# TODO: remove this once numpy 2.0 proper releases -# and specify numpy 2.0 as a dependency in [build-system] requires in pyproject.toml -before-build = "pip install delvewheel numpy==2.0.0rc1" +before-build = "pip install delvewheel" repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}" [[tool.cibuildwheel.overrides]] From 8d67e77d6aa1b13611c27a63b87f5912fe938f85 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 18 Sep 2024 14:43:19 -0700 Subject: [PATCH 184/396] Backport PR #59836 on branch 2.2.x (BLD: Fix bad Cython annotation) (#59837) Backport PR #59836: BLD: Fix bad Cython annotation Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/_libs/tslibs/np_datetime.pxd | 2 +- pandas/_libs/tslibs/np_datetime.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index cb2658d343772..a8ac80a2d0f39 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -89,7 +89,7 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - format: str | None = *, + str format = *, bint exact = * ) except? -1 diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index aa01a05d0d932..779d1e3111932 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -331,7 +331,7 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - format: str | None=None, + str format=None, bint exact=True, ) except? -1: cdef: From 0bd98feb952d678dcd6da090529c7457db11ca1b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 19 Sep 2024 02:02:31 +0200 Subject: [PATCH 185/396] Backport PR #59136 on branch 2.2.x (Upload 3.13 & free-threaded nightly wheels) (#59835) * Bump pypa/cibuildwheel from 2.19.1 to 2.19.2 (#59208) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.19.1 to 2.19.2. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.19.1...v2.19.2) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> (cherry picked from commit ad09dc6108896e175979c247cff2878d259acf3d) * Upload 3.13 & free-threaded nightly wheels (#59136) * Upload free-threaded nightly wheels on Linux and macOS * Consolidate jobs into one * Install build dependencies in before-build and pass --no-build-isolation * Fix {project} placeholder in cibuildwheel config * Correctly quote echo CIBW_BUILD_FRONTEND command * Run echo -e * Add {package} to before-build * Include cibw script in sdist & add matrix value for build frontend * Change manifest and gitattributes * Change gitattributes * Install verioneer in before-build * Add cibw_before_test to install nightly NumPy * Expand before-test to musl * Better comments plus always run before-build/before-test on 3.13 * Add --no-build-isolation in 3.13 as well * Install nightly numpy before windows tests * Address feedback; add todo for NumPy nightly and move default outside matrix * Set build_frontend to 'build' in pyodide build --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> (cherry picked from commit 7c0ee27e6c00e9645154583917de0f385190d8d8) * CI: Update to cibuildwheel 2.20.0 (#59401) cibuildwheel 2.20.0 uses the ABI stable Python 3.13.0rc1 and build Python 3.13 wheels by default, which allows removing the `CIBW_PRERELEASE_PYTHONS` flag. Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> (cherry picked from commit 70bb855cbbc75b52adcb127c84e0a35d2cd796a9) * Update wheels.yml * BLD/RLS: build wheels with released numpy/cython for Python 3.13 (#59819) (cherry picked from commit 22372175e04f05f73521cab1b26f0818d6766717) * enable prerelease again --------- Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Lysandros Nikolaou Co-authored-by: Ewout ter Hoeven Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .circleci/config.yml | 2 +- .gitattributes | 5 ++++- .github/workflows/wheels.yml | 15 +++++++++++++-- MANIFEST.in | 3 +++ pyproject.toml | 4 +++- scripts/cibw_before_build.sh | 7 +++++++ 6 files changed, 31 insertions(+), 5 deletions(-) create mode 100644 scripts/cibw_before_build.sh diff --git a/.circleci/config.yml b/.circleci/config.yml index 9ef3f9e2857a0..bab5491088089 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -72,7 +72,7 @@ jobs: name: Build aarch64 wheels no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that command: | - pip3 install cibuildwheel==2.15.0 + pip3 install cibuildwheel==2.20.0 cibuildwheel --prerelease-pythons --output-dir wheelhouse environment: diff --git a/.gitattributes b/.gitattributes index 19c6fd2fd1d47..2655d0d018d4f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -68,7 +68,7 @@ ci export-ignore doc export-ignore gitpod export-ignore MANIFEST.in export-ignore -scripts export-ignore +scripts/** export-ignore typings export-ignore web export-ignore CITATION.cff export-ignore @@ -82,3 +82,6 @@ setup.py export-ignore # csv_dir_path fixture checks the existence of the directory # exclude the whole directory to avoid running related tests in sdist pandas/tests/io/parser/data export-ignore + +# Include cibw script in sdist since it's needed for building wheels +scripts/cibw_before_build.sh -export-ignore diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 3d4fbfb995fb3..41417622c3ef2 
100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -99,7 +99,17 @@ jobs: - [macos-14, macosx_arm64] - [windows-2022, win_amd64] # TODO: support PyPy? - python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] + python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]] + include: + # TODO: Remove this plus installing build deps in cibw_before_build.sh + # after pandas can be built with a released NumPy/Cython + - python: ["cp313t", "3.13"] + cibw_build_frontend: 'pip; args: --no-build-isolation' + # TODO: Build free-threaded wheels for Windows + exclude: + - buildplat: [windows-2022, win_amd64] + python: ["cp313t", "3.13"] + env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} @@ -140,12 +150,13 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: CIBW_PRERELEASE_PYTHONS: True CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} + CIBW_BUILD_FRONTEND: ${{ matrix.cibw_build_frontend || 'pip' }} - name: Set up Python uses: mamba-org/setup-micromamba@v1 diff --git a/MANIFEST.in b/MANIFEST.in index 9894381ed6252..a7d7d7eb4e062 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -62,3 +62,6 @@ prune pandas/tests/io/parser/data # Selectively re-add *.cxx files that were excluded above graft pandas/_libs/src graft pandas/_libs/include + +# Include cibw script in sdist since it's needed for building wheels +include scripts/cibw_before_build.sh diff --git a/pyproject.toml b/pyproject.toml index 571c086d2220b..2a8e63caaf37a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -160,9 +160,11 @@ test-command = """ pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "-n 2", "--no-strict-data-files"]); \ pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ """ +free-threaded-support = true +before-build = "bash {package}/scripts/cibw_before_build.sh" [tool.cibuildwheel.windows] -before-build = "pip install delvewheel" +before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build.sh" repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}" [[tool.cibuildwheel.overrides]] diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh new file mode 100644 index 0000000000000..6186340807f8f --- /dev/null +++ b/scripts/cibw_before_build.sh @@ -0,0 +1,7 @@ +# TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13. 
+FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" +if [[ $FREE_THREADED_BUILD == "True" ]]; then + python -m pip install -U pip + python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython + python -m pip install ninja meson-python versioneer[toml] +fi From 69587385668f0ce61c7fbfc7946a187f8835b194 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 19 Sep 2024 16:41:21 -0400 Subject: [PATCH 186/396] Backport PR #59840: BLD: Final release prep for 2.2.3 (#59842) --- doc/source/conf.py | 4 +++- doc/source/whatsnew/v2.2.2.rst | 2 +- doc/source/whatsnew/v2.2.3.rst | 23 ++++++++++++++++------- pyproject.toml | 2 +- scripts/cibw_before_build.sh | 5 +++++ 5 files changed, 26 insertions(+), 10 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index be6150d4e54ba..3f3241f81af59 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -254,7 +254,9 @@ "json_url": "https://pandas.pydata.org/versions.json", "version_match": switcher_version, }, - "show_version_warning_banner": True, + # This shows a warning for patch releases since the + # patch version doesn't compare as equal (e.g. 2.2.1 != 2.2.0 but it should be) + "show_version_warning_banner": False, "icon_links": [ { "name": "Mastodon", diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 72a2f84c4aaee..fbe5e9b4febb5 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -56,4 +56,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v2.2.1..v2.2.2|HEAD +.. contributors:: v2.2.1..v2.2.2 diff --git a/doc/source/whatsnew/v2.2.3.rst b/doc/source/whatsnew/v2.2.3.rst index aa6e241e74b0a..1696a7b6449af 100644 --- a/doc/source/whatsnew/v2.2.3.rst +++ b/doc/source/whatsnew/v2.2.3.rst @@ -1,6 +1,6 @@ .. _whatsnew_223: -What's new in 2.2.3 (September XX, 2024) +What's new in 2.2.3 (September 20, 2024) ---------------------------------------- These are the changes in pandas 2.2.3. See :ref:`release` for a full changelog @@ -9,28 +9,37 @@ including other versions of pandas. {{ header }} .. --------------------------------------------------------------------------- -.. _whatsnew_223.regressions: -Fixed regressions -~~~~~~~~~~~~~~~~~ -- +.. _whatsnew_220.py13_compat: + +Pandas 2.2.3 is now compatible with Python 3.13 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Pandas 2.2.3 is the first version of pandas that is generally compatible with the upcoming +Python 3.13, and both wheels for free-threaded and normal Python 3.13 will be uploaded for +this release. + +As usual please report any bugs discovered to our `issue tracker `_ .. --------------------------------------------------------------------------- .. _whatsnew_223.bug_fixes: Bug fixes ~~~~~~~~~ -- +- Bug in :func:`eval` on :class:`complex` including division ``/`` discards imaginary part. (:issue:`21374`) +- Minor fixes for numpy 2.1 compatibility. (:issue:`59444`) .. --------------------------------------------------------------------------- .. _whatsnew_223.other: Other ~~~~~ -- +- Missing licenses for 3rd party dependencies were added back into the wheels. (:issue:`58632`) .. --------------------------------------------------------------------------- .. _whatsnew_223.contributors: Contributors ~~~~~~~~~~~~ + +.. 
contributors:: v2.2.2..v2.2.3|HEAD diff --git a/pyproject.toml b/pyproject.toml index 2a8e63caaf37a..18a88cd0a1f38 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -161,7 +161,7 @@ test-command = """ pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ """ free-threaded-support = true -before-build = "bash {package}/scripts/cibw_before_build.sh" +before-build = "PACKAGE_DIR={package} bash {package}/scripts/cibw_before_build.sh" [tool.cibuildwheel.windows] before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build.sh" diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh index 6186340807f8f..679b91e3280ec 100644 --- a/scripts/cibw_before_build.sh +++ b/scripts/cibw_before_build.sh @@ -1,3 +1,8 @@ +# Add 3rd party licenses, like numpy does +for file in $PACKAGE_DIR/LICENSES/*; do + cat $file >> $PACKAGE_DIR/LICENSE +done + # TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13. FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" if [[ $FREE_THREADED_BUILD == "True" ]]; then From f108468a42932476754b359f33197da9faa06cd6 Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Fri, 20 Sep 2024 06:59:08 -0400 Subject: [PATCH 187/396] RLS: 2.2.3 From 6891e90c4ed2a5c9843acbdb26a295faf1bfe386 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 20 Sep 2024 07:39:20 -0400 Subject: [PATCH 188/396] Backport PR #59847: BLD: Build wheels for Python 3.13 on aarch64 as well --- .circleci/config.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index bab5491088089..50ff7a81ae103 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -73,7 +73,13 @@ jobs: no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that command: | pip3 install cibuildwheel==2.20.0 - cibuildwheel --prerelease-pythons --output-dir wheelhouse + if [[ $CIBW_BUILD == cp313t* ]]; then + # TODO: temporarily run 3.13 free threaded builds without build isolation + # since we need pre-release cython + CIBW_BUILD_FRONTEND="pip; args: --no-build-isolation" cibuildwheel --prerelease-pythons --output-dir wheelhouse + else + cibuildwheel --prerelease-pythons --output-dir wheelhouse + fi environment: CIBW_BUILD: << parameters.cibw-build >> @@ -128,7 +134,11 @@ workflows: "cp310-manylinux_aarch64", "cp311-manylinux_aarch64", "cp312-manylinux_aarch64", + "cp313-manylinux_aarch64", + "cp313t-manylinux_aarch64", "cp39-musllinux_aarch64", "cp310-musllinux_aarch64", "cp311-musllinux_aarch64", - "cp312-musllinux_aarch64",] + "cp312-musllinux_aarch64", + "cp313-musllinux_aarch64", + "cp313t-musllinux_aarch64"] From 658dfddaec7548151db4c832a8472d732b1afec9 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 20 Sep 2024 07:51:33 -0400 Subject: [PATCH 189/396] relax cython bound --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 18a88cd0a1f38..238abd85dcdb1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "meson-python==0.13.1", "meson==1.2.1", "wheel", - "Cython==3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json + "Cython~=3.0.5", # Note: sync with setup.py, environment.yml and 
asv.conf.json # Force numpy higher than 2.0, so that built wheels are compatible # with both numpy 1 and 2 "numpy>=2.0", From 0691c5cf90477d3503834d983f69350f250a6ff7 Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Fri, 20 Sep 2024 08:21:50 -0400 Subject: [PATCH 190/396] RLS: 2.2.3 From 0eb547b650d8eaa2b994fd7164e10121efb17b26 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 12:51:03 +0200 Subject: [PATCH 191/396] Start 2.3.0 From 9440c866d87ae6fb247a2f3141154ce792fa920b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 2 Oct 2024 04:00:17 -0700 Subject: [PATCH 192/396] Backport PR #59939 on branch 2.3.x (CI: Run jobs on 2.3.x branch) (#59940) Backport PR #59939: CI: Run jobs on 2.3.x branch Co-authored-by: Joris Van den Bossche --- .github/workflows/code-checks.yml | 4 ++-- .github/workflows/docbuild-and-upload.yml | 4 ++-- .github/workflows/package-checks.yml | 4 ++-- .github/workflows/unit-tests.yml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index f908d1e572ab1..dacf740e5d4d8 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.2.x + - 2.3.x pull_request: branches: - main - - 2.2.x + - 2.3.x env: ENV_FILE: environment.yml diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index e470b181772ed..3abe9c92bcefa 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -4,13 +4,13 @@ on: push: branches: - main - - 2.2.x + - 2.3.x tags: - '*' pull_request: branches: - main - - 2.2.x + - 2.3.x env: ENV_FILE: environment.yml diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 7c1da5678a2aa..e1be5659bbd9a 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.2.x + - 2.3.x pull_request: branches: - main - - 2.2.x + - 2.3.x types: [ labeled, opened, synchronize, reopened ] permissions: diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index ad63908e4682d..b54fe8f044c78 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.2.x + - 2.3.x pull_request: branches: - main - - 2.2.x + - 2.3.x paths-ignore: - "doc/**" - "web/**" From 7e1642019dda30b78a36d5da3b77ec18bbd5765f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 2 Oct 2024 05:41:25 -0700 Subject: [PATCH 193/396] Backport PR #59912 on branch 2.3.x (CI: Pin micromamba to 1.x) (#59936) Backport PR #59912: CI: Pin micromamba to 1.x Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Co-authored-by: Joris Van den Bossche --- .github/actions/setup-conda/action.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index ceeebfcd1c90c..e31fed120267a 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -9,6 +9,8 @@ runs: - name: Install ${{ inputs.environment-file }} uses: mamba-org/setup-micromamba@v1 with: + # Pinning to avoid 2.0 failures + micromamba-version: '1.5.10-0' environment-file: ${{ 
inputs.environment-file }} environment-name: test condarc-file: ci/.condarc From 24510bd82d519404a4a8138df207ce100add219d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 23:23:48 +0200 Subject: [PATCH 194/396] Backport PR #59807: BUG (CoW): fix reference tracking in replace_list with None (#59943) * BUG (CoW): fix reference tracking in replace_list with None (#59807) (cherry picked from commit 3e8ac12d1dacc2308b2f4c2869fa7bc2079bd323) * correct test to work without CoW --- pandas/core/internals/blocks.py | 2 +- pandas/tests/copy_view/test_replace.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 259e969112dd7..2f448bf249a2e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1218,7 +1218,7 @@ def _replace_coerce( putmask_inplace(nb.values, mask, value) return [nb] if using_cow: - return [self] + return [self.copy(deep=False)] return [self] if inplace else [self.copy()] return self.replace( to_replace=to_replace, diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 6d16bc3083883..0beac439fbb58 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -384,6 +384,15 @@ def test_replace_list_none(using_copy_on_write): assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + # replace multiple values that don't actually replace anything with None + # https://github.com/pandas-dev/pandas/issues/59770 + df3 = df.replace(["d", "e", "f"], value=None) + tm.assert_frame_equal(df3, df_orig) + if using_copy_on_write: + assert tm.shares_memory(get_array(df, "a"), get_array(df3, "a")) + else: + assert not tm.shares_memory(get_array(df, "a"), get_array(df3, "a")) + def test_replace_list_none_inplace_refs(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": ["a", "b", "c"]}) From 6e5ccd86bc660ffa04476d25ac06cd5a93b52717 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 3 Oct 2024 09:09:31 +0200 Subject: [PATCH 195/396] Backport PR #59893 (CI/TST: Check for tzset in set_timezone) (#59941) Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/_testing/contexts.py | 17 +++++++++-------- pandas/tests/tslibs/test_parsing.py | 3 ++- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py index eb6e4a917889a..48616ee134582 100644 --- a/pandas/_testing/contexts.py +++ b/pandas/_testing/contexts.py @@ -78,14 +78,15 @@ def set_timezone(tz: str) -> Generator[None, None, None]: import time def setTZ(tz) -> None: - if tz is None: - try: - del os.environ["TZ"] - except KeyError: - pass - else: - os.environ["TZ"] = tz - time.tzset() + if hasattr(time, "tzset"): + if tz is None: + try: + del os.environ["TZ"] + except KeyError: + pass + else: + os.environ["TZ"] = tz + time.tzset() orig_tz = os.environ.get("TZ") setTZ(tz) diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index d8f23156bd4d4..fb05a57056a83 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -17,6 +17,7 @@ from pandas._libs.tslibs.parsing import parse_datetime_string_with_reso from pandas.compat import ( ISMUSL, + is_platform_arm, is_platform_windows, ) import pandas.util._test_decorators as td @@ -26,7 +27,7 @@ @pytest.mark.skipif( - is_platform_windows() or ISMUSL, + 
is_platform_windows() or ISMUSL or is_platform_arm(), reason="TZ setting incorrect on Windows and MUSL Linux", ) def test_parsing_tzlocal_deprecated(): From b71c59987619b38f9547ff02aa92d80e69afe792 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 24 Jul 2024 18:08:31 +0200 Subject: [PATCH 196/396] PDEP-14: Dedicated string data type for pandas 3.0 (#58551) Co-authored-by: Simon Hawkins Co-authored-by: Irv Lustig Co-authored-by: William Ayd Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- web/pandas/pdeps/0014-string-dtype.md | 375 ++++++++++++++++++++++++++ 1 file changed, 375 insertions(+) create mode 100644 web/pandas/pdeps/0014-string-dtype.md diff --git a/web/pandas/pdeps/0014-string-dtype.md b/web/pandas/pdeps/0014-string-dtype.md new file mode 100644 index 0000000000000..5b74f71216454 --- /dev/null +++ b/web/pandas/pdeps/0014-string-dtype.md @@ -0,0 +1,375 @@ +# PDEP-14: Dedicated string data type for pandas 3.0 + +- Created: May 3, 2024 +- Status: Accepted +- Discussion: https://github.com/pandas-dev/pandas/pull/58551 +- Author: [Joris Van den Bossche](https://github.com/jorisvandenbossche) +- Revision: 1 + +## Abstract + +This PDEP proposes to introduce a dedicated string dtype that will be used by +default in pandas 3.0: + +* In pandas 3.0, enable a string dtype (`"str"`) by default, using PyArrow if available + or otherwise a string dtype using numpy object-dtype under the hood as fallback. +* The default string dtype will use missing value semantics (using NaN) consistent + with the other default data types. + +This will give users a long-awaited proper string dtype for 3.0, while 1) not +(yet) making PyArrow a _hard_ dependency, but only a dependency used by default, +and 2) leaving room for future improvements (different missing value semantics, +using NumPy 2.0 strings, etc). + +## Background + +Currently, pandas by default stores text data in an `object`-dtype NumPy array. +The current implementation has two primary drawbacks. First, `object` dtype is +not specific to strings: any Python object can be stored in an `object`-dtype +array, not just strings, and seeing `object` as the dtype for a column with +strings is confusing for users. Second: this is not efficient (all string +methods on a Series are eventually calling Python methods on the individual +string objects). + +To solve the first issue, a dedicated extension dtype for string data has +already been +[added in pandas 1.0](https://pandas.pydata.org/docs/whatsnew/v1.0.0.html#dedicated-string-data-type). +This has always been opt-in for now, requiring users to explicitly request the +dtype (with `dtype="string"` or `dtype=pd.StringDtype()`). The array backing +this string dtype was initially almost the same as the default implementation, +i.e. an `object`-dtype NumPy array of Python strings. + +To solve the second issue (performance), pandas contributed to the development +of string kernels in the PyArrow package, and a variant of the string dtype +backed by PyArrow was +[added in pandas 1.3](https://pandas.pydata.org/docs/whatsnew/v1.3.0.html#pyarrow-backed-string-data-type). +This could be specified with the `storage` keyword in the opt-in string dtype +(`pd.StringDtype(storage="pyarrow")`). 
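+
+As a brief illustration of that opt-in usage (assuming pandas >= 1.3 with
+PyArrow installed):
+
+```python
+>>> import pandas as pd
+>>> pd.Series(["a", "b", None], dtype="string").dtype  # object-backed variant
+string[python]
+>>> pd.Series(["a", "b", None], dtype=pd.StringDtype(storage="pyarrow")).dtype
+string[pyarrow]
+```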
+
+Since its introduction, the `StringDtype` has always been opt-in, and has used
+the experimental `pd.NA` sentinel for missing values (which was also [introduced
+in pandas 1.0](https://pandas.pydata.org/docs/whatsnew/v1.0.0.html#experimental-na-scalar-to-denote-missing-values)).
+However, up to this date, pandas has not yet taken the step to use `pd.NA`
+for any default dtype, and thus the `StringDtype` deviates in missing value
+behaviour compared to the default data types.
+
+In 2023, [PDEP-10](https://pandas.pydata.org/pdeps/0010-required-pyarrow-dependency.html)
+proposed to start using a PyArrow-backed string dtype by default in pandas 3.0
+(i.e. infer this type for string data instead of object dtype). To ensure we
+could use the variant of `StringDtype` backed by PyArrow instead of Python
+objects (for better performance), it proposed to make `pyarrow` a new required
+runtime dependency of pandas.
+
+In the meantime, NumPy has also been working on a native variable-width string
+data type, which was made available [starting with NumPy
+2.0](https://numpy.org/devdocs/release/2.0.0-notes.html#stringdtype-has-been-added-to-numpy).
+This can provide a potential alternative to PyArrow for implementing a string
+data type in pandas that is not backed by Python objects.
+
+After acceptance of PDEP-10, two aspects of the proposal have been under
+reconsideration:
+
+- Based on feedback from users and maintainers from other packages (mostly
+  around installation complexity and size), it has been considered to relax the
+  new `pyarrow` requirement to not be a _hard_ runtime dependency. In addition,
+  NumPy 2.0 could in the future potentially reduce the need to make PyArrow a
+  required dependency specifically for a dedicated pandas string dtype.
+- PDEP-10 did not consider the usage of the experimental `pd.NA` as a
+  consequence of adopting one of the existing implementations of the
+  `StringDtype`.
+
+For the second aspect, another variant of the `StringDtype` was
+[introduced in pandas 2.1](https://pandas.pydata.org/docs/whatsnew/v2.1.0.html#whatsnew-210-enhancements-infer-strings)
+that is still backed by PyArrow but follows the default missing value semantics
+pandas uses for all other default data types (and using `NaN` as the missing
+value sentinel) ([GH-54792](https://github.com/pandas-dev/pandas/issues/54792)).
+At the time, the `storage` option for this new variant was called
+`"pyarrow_numpy"` to disambiguate from the existing `"pyarrow"` option using
+`pd.NA` (but this PDEP proposes a better naming scheme, see the "Naming"
+subsection below).
+
+This last dtype variant is what users currently (pandas 2.2) get for string data
+when enabling the ``future.infer_string`` option (to enable the behaviour which
+is intended to become the default in pandas 3.0).
+
+## Proposal
+
+To be able to move forward with a string data type in pandas 3.0, this PDEP proposes:
+
+1. For pandas 3.0, a `"str"` string dtype is enabled by default, i.e. this
+   string dtype will be used as the default dtype for text data when creating
+   pandas objects (e.g. inference in constructors, I/O functions).
+2. This default string dtype will follow the same behaviour for missing values
+   as other default data types, and use `NaN` as the missing value sentinel.
+3. The string dtype will use PyArrow if installed, and otherwise falls back to
+   an in-house functionally-equivalent (but slower) version. This fallback can
+   reuse (with minor code additions) the existing numpy object-dtype backed
+   StringArray for its implementation.
+4. Installation guidelines are updated to clearly encourage users to install
+   pyarrow for the default user experience.
+
+Those string dtypes enabled by default will then no longer be considered as
+experimental.
+
+### Default inference of a string dtype
+
+By default, pandas will infer this new string dtype instead of object dtype for
+string data (when creating pandas objects, such as in constructors or IO
+functions).
+
+In pandas 2.2, the existing `future.infer_string` option can be used to opt-in to the future
+default behaviour:
+
+```python
+>>> pd.options.future.infer_string = True
+>>> pd.Series(["a", "b", None])
+0      a
+1      b
+2    NaN
+dtype: string
+```
+
+Right now (pandas 2.2), the existing option only enables the PyArrow-based
+future dtype. For the remaining 2.x releases, this option will be expanded to
+also work when PyArrow is not installed to enable the object-dtype fallback in
+that case.
+
+### Missing value semantics
+
+As mentioned in the background section, the original `StringDtype` has always
+used the experimental `pd.NA` sentinel for missing values. In addition to using
+`pd.NA` as the scalar for a missing value, this essentially means that:
+
+- String columns follow ["NA-semantics"](https://pandas.pydata.org/docs/user_guide/missing_data.html#na-semantics)
+  for missing values, where `NA` propagates in boolean operations such as
+  comparisons or predicates.
+- Operations on the string column that give a numeric or boolean result use the
+  nullable Integer/Float/Boolean data types (e.g. `ser.str.len()` returns the
+  nullable `"Int64"` / `pd.Int64Dtype()` dtype instead of the numpy `int64`
+  dtype (or `float64` in case of missing values)).
+
+However, up to this date, all other default data types still use `NaN` semantics
+for missing values. Therefore, this proposal says that a new default string
+dtype should also still use the same default missing value semantics and return
+default data types when doing operations on the string column, to be consistent
+with the other default dtypes at this point.
+
+In practice, this means that the default string dtype will use `NaN` as
+the missing value sentinel, and:
+
+- String columns will follow NaN-semantics for missing values, where `NaN` gives
+  False in boolean operations such as comparisons or predicates.
+- Operations on the string column that give a numeric or boolean result will use
+  the default data types (i.e. numpy `int64`/`float64`/`bool`).
+
+Because the original `StringDtype` implementations already use `pd.NA` and
+return masked integer and boolean arrays in operations, a new variant of the
+existing dtypes that uses `NaN` and default data types was needed. The original
+variant of `StringDtype` using `pd.NA` will continue to be available for those
+who were already using it.
+
+### Object-dtype "fallback" implementation
+
+To avoid a hard dependency on PyArrow for pandas 3.0, this PDEP proposes to keep
+a "fallback" option in case PyArrow is not installed. The original `StringDtype`
+backed by a numpy object-dtype array of Python strings can be mostly reused for
+this (adding a new variant of the dtype) and a new `StringArray` subclass only
+needs minor changes to follow the above-mentioned missing value semantics
+([GH-58451](https://github.com/pandas-dev/pandas/pull/58451)).
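+
+As a minimal sketch (a hypothetical helper, not the actual pandas internals),
+the storage selection described in this section amounts to:
+
+```python
+def _default_string_storage() -> str:
+    # Prefer the PyArrow-backed implementation when pyarrow is installed,
+    # otherwise fall back to the numpy object-dtype (Python strings) version.
+    try:
+        import pyarrow  # noqa: F401
+    except ImportError:
+        return "python"
+    return "pyarrow"
+```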
+
+For pandas 3.0, this is the most realistic option given this implementation has
+already been available for a long time. Beyond 3.0, further improvements such as
+using NumPy 2.0 ([GH-58503](https://github.com/pandas-dev/pandas/issues/58503))
+or nanoarrow ([GH-58552](https://github.com/pandas-dev/pandas/issues/58552)) can
+still be explored, but at that point that is an implementation detail that
+should not have a direct impact on users (except for performance).
+
+For the original variant of `StringDtype` using `pd.NA`, currently the default
+storage is `"python"` (the object-dtype based implementation). Also for this
+variant, it is proposed to follow the same logic for determining the default
+storage, i.e. default to `"pyarrow"` if available, and otherwise
+fall back to `"python"`.
+
+### Naming
+
+Given the long history of this topic, the naming of the dtypes is a difficult
+question.
+
+In the first place, it should be acknowledged that most users should not need to
+use storage-specific options. Users are expected to specify a generic name (such
+as `"str"` or `"string"`), and that will give them their default string dtype
+(which depends on whether PyArrow is installed or not).
+
+For the generic string alias to specify the dtype, `"string"` is already used
+for the `StringDtype` using `pd.NA`. This PDEP proposes to use `"str"` for the
+new default `StringDtype` using `NaN`. This ensures backwards compatibility for
+code using `dtype="string"`, and was also chosen because `dtype="str"` or
+`dtype=str` currently already works to ensure your data is converted to
+strings (only using object dtype for the result).
+
+But for testing purposes and advanced use cases that want control over the exact
+variant of the `StringDtype`, we need some way to specify this and distinguish
+them from the other string dtypes.
+
+Currently (pandas 2.2), `StringDtype(storage="pyarrow_numpy")` is used for the new variant using `NaN`,
+where the `"pyarrow_numpy"` storage was used to disambiguate from the existing
+`"pyarrow"` option using `pd.NA`. However, `"pyarrow_numpy"` is a rather confusing
+option and doesn't generalize well. Therefore, this PDEP proposes a new naming
+scheme as outlined below, and `"pyarrow_numpy"` will be deprecated as an alias
+in pandas 2.3 and removed in pandas 3.0.
+
+The `storage` keyword of `StringDtype` is kept to disambiguate the underlying
+storage of the string data (using pyarrow or python objects), but an additional
+`na_value` is introduced to disambiguate the variants using NA semantics
+and NaN semantics.
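+
+As a small illustration of those two keywords (a sketch using the constructor
+signature as proposed in this PDEP, so not yet available in released pandas 2.2):
+
+```python
+import numpy as np
+import pandas as pd
+
+# New default variant proposed here: NaN-based missing value semantics
+default_variant = pd.StringDtype(storage="pyarrow", na_value=np.nan)
+
+# Existing opt-in variant: pd.NA-based missing value semantics
+optin_variant = pd.StringDtype(storage="pyarrow", na_value=pd.NA)
+```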
+
+Overview of the different ways to specify a dtype and the resulting concrete
+dtype of the data:
+
+| User specification                           | Concrete dtype                                               | String alias                           | Note |
+|----------------------------------------------|--------------------------------------------------------------|----------------------------------------|------|
+| Unspecified (inference)                      | `StringDtype(storage="pyarrow"\|"python", na_value=np.nan)`  | "str"                                  | (1)  |
+| `"str"` or `StringDtype(na_value=np.nan)`    | `StringDtype(storage="pyarrow"\|"python", na_value=np.nan)`  | "str"                                  | (1)  |
+| `StringDtype("pyarrow", na_value=np.nan)`    | `StringDtype(storage="pyarrow", na_value=np.nan)`            | "str"                                  |      |
+| `StringDtype("python", na_value=np.nan)`     | `StringDtype(storage="python", na_value=np.nan)`             | "str"                                  |      |
+| `StringDtype("pyarrow")`                     | `StringDtype(storage="pyarrow", na_value=pd.NA)`             | "string[pyarrow]"                      |      |
+| `StringDtype("python")`                      | `StringDtype(storage="python", na_value=pd.NA)`              | "string[python]"                       |      |
+| `"string"` or `StringDtype()`                | `StringDtype(storage="pyarrow"\|"python", na_value=pd.NA)`   | "string[pyarrow]" or "string[python]"  | (1)  |
+| `StringDtype("pyarrow_numpy")`               | `StringDtype(storage="pyarrow", na_value=np.nan)`            | "string[pyarrow_numpy]"                | (2)  |
+
+Notes:
+
+- (1) You get "pyarrow" or "python" depending on pyarrow being installed.
+- (2) "pyarrow_numpy" is kept temporarily because this is already in a released
+  version, but it will be deprecated in 2.x and removed for 3.0.
+
+For the new default string dtype, only the `"str"` alias can be used to
+specify the dtype as a string, i.e. pandas would not provide a way to make the
+underlying storage (pyarrow or python) explicit through the string alias. This
+string alias is only a convenience shortcut and for most users `"str"` is
+sufficient (they don't need to specify the storage), and the explicit
+`pd.StringDtype(storage=..., na_value=np.nan)` is still available for more
+fine-grained control.
+
+Also for the existing variant using `pd.NA`, specifying the storage through the
+string alias could be deprecated, but that is left for a separate decision.
+
+## Alternatives
+
+### Why not delay introducing a default string dtype?
+
+To avoid introducing a new string dtype while other discussions and changes are
+in flux (eventually making pyarrow a required dependency? adopting `pd.NA` as
+the default missing value sentinel? using the new NumPy 2.0 capabilities?
+overhauling all our dtypes to use a logical data type system?), introducing a
+default string dtype could also be delayed until there is more clarity in those
+other discussions. Specifically, it would avoid temporarily switching to use
+`NaN` for the string dtype, while in a future version we might switch back
+to `pd.NA` by default.
+
+However:
+
+1. Delaying has a cost: it further postpones introducing a dedicated string
+   dtype that has significant benefits for users, both in usability and (for the
+   part of the user base that has PyArrow installed) in performance.
+2. In case pandas eventually transitions to use `pd.NA` as the default missing value
+   sentinel, a migration path for _all_ pandas data types will be needed, and thus
+   the challenges around this will not be unique to the string dtype and
+   therefore not a reason to delay this.
+
+Making this change now for 3.0 will benefit the majority of users, and the PDEP
+author believes this is worth the cost of the added complexity around "yet
+another dtype" (also for other data types we already have multiple variants).
+
+### Why not use the existing StringDtype with `pd.NA`?
+
+Wouldn't adding even more variants of the string dtype make things only more
+confusing? Indeed, this proposal unfortunately introduces more variants of the
+string dtype. However, the reason for this is to ensure the actual default user
+experience is _less_ confusing, and the new string dtype fits better with the
+other default data types.
+
+If the new default string data type were to use `pd.NA`, then after some
+operations, a user can easily end up with a DataFrame that mixes columns using
+`NaN` semantics and columns using `NA` semantics (and thus a DataFrame that
+could have columns with two different int64, two different float64, two different
+bool, etc dtypes). This would lead to a very confusing default experience.
+
+With the proposed new variant of the StringDtype, this will ensure that for the
+_default_ experience, a user will only see 1 kind of integer dtype, only 1 kind
+of bool dtype, etc. For now, a user should only get columns using `pd.NA`
+when explicitly opting into this.
+
+### Naming alternatives
+
+An initial version of this PDEP proposed to use the `"string"` alias and the
+default `pd.StringDtype()` class constructor for the new default dtype.
+However, that caused a lot of discussion around backwards compatibility for
+existing users of `dtype=pd.StringDtype()` and `dtype="string"`, which use
+`pd.NA` to represent missing values.
+
+During the discussion, several alternatives have been brought up, both
+alternative keyword names and the use of a different constructor. In the end,
+this PDEP proposes to use a different string alias (`"str"`) but to keep
+using the existing `pd.StringDtype` (with the existing `storage` keyword but
+with an additional `na_value` keyword) for now to keep the changes as
+minimal as possible, leaving a larger overhaul of the dtype system (potentially
+including different constructor functions or namespace) for a future discussion.
+See [GH-58613](https://github.com/pandas-dev/pandas/issues/58613) for the full
+discussion.
+
+One consequence is that when using the class constructor for the default dtype,
+it has to be used with non-default arguments, i.e. a user needs to specify
+`pd.StringDtype(na_value=np.nan)` to get the default dtype using `NaN`.
+Therefore, the pandas documentation will focus on the usage of `dtype="str"`.
+
+## Backward compatibility
+
+The most visible backwards incompatible change will be that columns with string
+data will no longer have an `object` dtype. Therefore, code that assumes
+`object` dtype (such as `ser.dtype == object`) will need to be updated. This
+change is done as a hard break in a major release, as warning in advance about the
+changed inference is deemed too noisy.
+
+To allow testing code in advance, the
+`pd.options.future.infer_string = True` option is available for users.
+
+Otherwise, the actual string-specific functionality (such as the `.str` accessor
+methods) should generally all keep working as is.
+
+By preserving the current missing value semantics, this proposal is also mostly
+backwards compatible on this aspect. When storing strings in object dtype, pandas
+however did allow using `None` as the missing value indicator as well (and in
+certain cases such as the `shift` method, pandas even introduced this itself).
+For all the cases where currently `None` was used as the missing value sentinel,
+this will change to consistently use `NaN`.
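+
+As a short sketch of the points above (assuming the behaviour proposed here is
+enabled, e.g. through the `pd.options.future.infer_string = True` option):
+
+```python
+>>> import numpy as np
+>>> import pandas as pd
+>>> ser = pd.Series(["a", "b"])
+>>> ser.dtype == object        # True today with object dtype, False under this proposal
+False
+>>> np.isnan(ser.shift(1)[0])  # the introduced missing value is NaN, not None
+True
+```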
### For existing users of `StringDtype`

Existing code that already opted in to the `StringDtype` using `pd.NA` should
generally keep working as is. The latest version of this PDEP preserves the
behaviour of `dtype="string"` or `dtype=pd.StringDtype()` to mean the `pd.NA`
variant of the dtype.

It does propose to change the default storage to `"pyarrow"` (if available)
for the opt-in `pd.NA` variant as well, but this should have limited, if any,
user-visible impact.

## Timeline

The future PyArrow-backed string dtype was already made available behind a
feature flag in pandas 2.1 (enabled by `pd.options.future.infer_string = True`).

The variant using numpy object-dtype can also be backported to the 2.2.x branch
to allow easier testing. It is proposed to release this as 2.3.0 (created from
the 2.2.x branch, given that the main branch already includes many other
changes targeted for 3.0), together with the changes to the naming scheme.

The 2.3.0 release would then have all future string functionality available
(both the pyarrow and object-dtype based variants of the default string dtype).

For pandas 3.0, the `future.infer_string` flag becomes enabled by default.

## PDEP-14 History

- 3 May 2024: Initial version

From 932355200384f575db64c4d4ce91278e855a21d1 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Fri, 26 Jul 2024 11:36:25 +0200
Subject: [PATCH 197/396] TST / string dtype: add env variable to enable future_string and add test build (#58459)

---
 .github/workflows/unit-tests.yml | 7 ++++++-
 ci/run_tests.sh                  | 6 ++++++
 pandas/core/config_init.py       | 2 +-
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index b54fe8f044c78..7b6f4e152f3a3 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -85,6 +85,10 @@ jobs:
           env_file: actions-39.yaml
           pattern: "not slow and not network and not single_cpu"
           pandas_copy_on_write: "warn"
+        - name: "Future infer strings"
+          env_file: actions-311.yaml
+          pattern: "not slow and not network and not single_cpu"
+          pandas_future_infer_string: "1"
         - name: "Pypy"
           env_file: actions-pypy-39.yaml
           pattern: "not slow and not network and not single_cpu"
@@ -103,7 +107,8 @@ jobs:
       LANG: ${{ matrix.lang || 'C.UTF-8' }}
       LC_ALL: ${{ matrix.lc_all || '' }}
       PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }}
-      PANDAS_CI: ${{ matrix.pandas_ci || '1' }}
+      PANDAS_CI: '1'
+      PANDAS_FUTURE_INFER_STRING: ${{ matrix.pandas_future_infer_string || '0' }}
       TEST_ARGS: ${{ matrix.test_args || '' }}
       PYTEST_WORKERS: ${{ matrix.pytest_workers || 'auto' }}
       PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}
diff --git a/ci/run_tests.sh b/ci/run_tests.sh
index 39ab0890a32d1..9b48778c41804 100755
--- a/ci/run_tests.sh
+++ b/ci/run_tests.sh
@@ -16,5 +16,11 @@ if [[ "$PATTERN" ]]; then
   PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\""
 fi

+# temporarily let pytest always succeed (many tests are not yet passing in the
+# build enabling the future string dtype)
+if [[ "$PANDAS_FUTURE_INFER_STRING" == "1" ]]; then
+  PYTEST_CMD="$PYTEST_CMD || true"
+fi
+
 echo $PYTEST_CMD
 sh -c "$PYTEST_CMD"
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index a8b63f97141c2..a6625d99eaa71 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -905,7 +905,7 @@ def register_converter_cb(key) -> None:
 with cf.config_prefix("future"):
     cf.register_option(
         "infer_string",
-        False,
+        True if
os.environ.get("PANDAS_FUTURE_INFER_STRING", "0") == "1" else False, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", From 7ee3154dfdac439d39bbdb08ac87fae3479bfd59 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 26 Jul 2024 19:20:59 +0200 Subject: [PATCH 198/396] REF (string dtype): rename using_pyarrow_string_dtype to using_string_dtype (#59320) --- pandas/_config/__init__.py | 2 +- pandas/_libs/lib.pyx | 4 +- pandas/core/construction.py | 16 ++--- pandas/core/dtypes/cast.py | 4 +- pandas/core/indexes/base.py | 4 +- pandas/core/internals/construction.py | 4 +- pandas/io/feather_format.py | 6 +- pandas/io/orc.py | 4 +- pandas/io/parquet.py | 4 +- pandas/io/parsers/arrow_parser_wrapper.py | 4 +- pandas/io/pytables.py | 10 +-- pandas/io/sql.py | 4 +- pandas/tests/arithmetic/test_object.py | 4 +- .../arrays/categorical/test_constructors.py | 4 +- pandas/tests/arrays/categorical/test_repr.py | 4 +- pandas/tests/base/test_misc.py | 4 +- pandas/tests/base/test_unique.py | 4 +- pandas/tests/extension/base/ops.py | 4 +- pandas/tests/extension/test_categorical.py | 4 +- .../frame/constructors/test_from_dict.py | 6 +- .../frame/constructors/test_from_records.py | 4 +- pandas/tests/frame/methods/test_fillna.py | 6 +- .../tests/frame/methods/test_interpolate.py | 6 +- pandas/tests/frame/methods/test_replace.py | 62 +++++-------------- pandas/tests/frame/test_api.py | 4 +- pandas/tests/frame/test_arithmetic.py | 6 +- pandas/tests/frame/test_constructors.py | 6 +- pandas/tests/frame/test_reductions.py | 8 +-- pandas/tests/frame/test_repr.py | 4 +- .../tests/indexes/base_class/test_formats.py | 6 +- .../indexes/categorical/test_category.py | 4 +- .../tests/indexes/categorical/test_formats.py | 4 +- pandas/tests/indexes/interval/test_formats.py | 4 +- pandas/tests/indexes/test_old_base.py | 4 +- pandas/tests/indexing/test_coercion.py | 4 +- pandas/tests/indexing/test_indexing.py | 8 +-- pandas/tests/indexing/test_loc.py | 4 +- pandas/tests/io/excel/test_readers.py | 6 +- pandas/tests/io/formats/test_format.py | 10 +-- pandas/tests/io/formats/test_to_string.py | 4 +- pandas/tests/io/json/test_pandas.py | 6 +- pandas/tests/reshape/test_pivot.py | 8 +-- pandas/tests/series/indexing/test_where.py | 4 +- pandas/tests/series/methods/test_reindex.py | 6 +- pandas/tests/series/methods/test_replace.py | 6 +- pandas/tests/series/test_formats.py | 4 +- 46 files changed, 122 insertions(+), 176 deletions(-) diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index 97784c924dab4..838b6affd2836 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -52,6 +52,6 @@ def using_nullable_dtypes() -> bool: return _mode_options["nullable_dtypes"] -def using_pyarrow_string_dtype() -> bool: +def using_string_dtype() -> bool: _mode_options = _global_config["future"] return _mode_options["infer_string"] diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7656e8d986117..1c2bba031e523 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -37,7 +37,7 @@ from cython cimport ( floating, ) -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs.missing import check_na_tuples_nonequal @@ -2725,7 +2725,7 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): + if 
using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(storage="pyarrow_numpy") diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f8250ae475a10..3de28ea242ce8 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -19,7 +19,7 @@ import numpy as np from numpy import ma -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas._libs.tslibs import ( @@ -566,11 +566,7 @@ def sanitize_array( if not is_list_like(data): if index is None: raise ValueError("index must be specified when data is not list-like") - if ( - isinstance(data, str) - and using_pyarrow_string_dtype() - and original_dtype is None - ): + if isinstance(data, str) and using_string_dtype() and original_dtype is None: from pandas.core.arrays.string_ import StringDtype dtype = StringDtype("pyarrow_numpy") @@ -604,14 +600,10 @@ def sanitize_array( subarr = data if data.dtype == object: subarr = maybe_infer_to_datetimelike(data) - if ( - object_index - and using_pyarrow_string_dtype() - and is_string_dtype(subarr) - ): + if object_index and using_string_dtype() and is_string_dtype(subarr): # Avoid inference when string option is set subarr = data - elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): + elif data.dtype.kind == "U" and using_string_dtype(): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(storage="pyarrow_numpy") diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b72293b52df06..af4c7c2c7c4f8 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -18,7 +18,7 @@ import numpy as np -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import ( Interval, @@ -798,7 +798,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: # coming out as np.str_! 
dtype = _dtype_obj - if using_pyarrow_string_dtype(): + if using_string_dtype(): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(storage="pyarrow_numpy") diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6822c2c99427e..76ddfffaa8a4d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -23,7 +23,7 @@ from pandas._config import ( get_option, using_copy_on_write, - using_pyarrow_string_dtype, + using_string_dtype, ) from pandas._libs import ( @@ -7011,7 +7011,7 @@ def insert(self, loc: int, item) -> Index: out = Index._with_infer(new_values, name=self.name) if ( - using_pyarrow_string_dtype() + using_string_dtype() and is_string_dtype(out.dtype) and new_values.dtype == object ): diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 609d2c9a7a285..14d7cadd21400 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -13,7 +13,7 @@ import numpy as np from numpy import ma -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib @@ -375,7 +375,7 @@ def ndarray_to_mgr( bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] - elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype(): + elif dtype is None and values.dtype.kind == "U" and using_string_dtype(): dtype = StringDtype(storage="pyarrow_numpy") obj_columns = list(values) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index d0aaf83b84cb2..68c73483add3f 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -6,7 +6,7 @@ Any, ) -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -120,7 +120,7 @@ def read_feather( with get_handle( path, "rb", storage_options=storage_options, is_text=False ) as handles: - if dtype_backend is lib.no_default and not using_pyarrow_string_dtype(): + if dtype_backend is lib.no_default and not using_string_dtype(): return feather.read_feather( handles.handle, columns=columns, use_threads=bool(use_threads) ) @@ -137,7 +137,7 @@ def read_feather( elif dtype_backend == "pyarrow": return pa_table.to_pandas(types_mapper=pd.ArrowDtype) - elif using_pyarrow_string_dtype(): + elif using_string_dtype(): return pa_table.to_pandas(types_mapper=arrow_string_types_mapper()) else: raise NotImplementedError diff --git a/pandas/io/orc.py b/pandas/io/orc.py index fed9463c38d5d..5706336b71697 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -9,7 +9,7 @@ Literal, ) -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -127,7 +127,7 @@ def read_orc( df = pa_table.to_pandas(types_mapper=mapping.get) return df else: - if using_pyarrow_string_dtype(): + if using_string_dtype(): types_mapper = arrow_string_types_mapper() else: types_mapper = None diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 9570d6f8b26bd..cc33c87dfc55a 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -12,7 +12,7 @@ import warnings from warnings import catch_warnings -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._config.config import 
_get_option from pandas._libs import lib @@ -257,7 +257,7 @@ def read( to_pandas_kwargs["types_mapper"] = mapping.get elif dtype_backend == "pyarrow": to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] - elif using_pyarrow_string_dtype(): + elif using_string_dtype(): to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() manager = _get_option("mode.data_manager", silent=True) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 890b22154648e..c774638fd73f7 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING import warnings -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -295,7 +295,7 @@ def read(self) -> DataFrame: dtype_mapping = _arrow_dtype_mapping() dtype_mapping[pa.null()] = pd.Int64Dtype() frame = table.to_pandas(types_mapper=dtype_mapping.get) - elif using_pyarrow_string_dtype(): + elif using_string_dtype(): frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) else: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 13c2f10785124..12bb93a63f850 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -31,7 +31,7 @@ config, get_option, using_copy_on_write, - using_pyarrow_string_dtype, + using_string_dtype, ) from pandas._libs import ( @@ -3224,7 +3224,7 @@ def read( index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) - if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): + if using_string_dtype() and is_string_array(values, skipna=True): result = result.astype("string[pyarrow_numpy]") return result @@ -3293,7 +3293,7 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) - if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): + if using_string_dtype() and is_string_array(values, skipna=True): df = df.astype("string[pyarrow_numpy]") dfs.append(df) @@ -4679,9 +4679,9 @@ def read( else: # Categorical df = DataFrame._from_arrays([values], columns=cols_, index=index_) - if not (using_pyarrow_string_dtype() and values.dtype.kind == "O"): + if not (using_string_dtype() and values.dtype.kind == "O"): assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) - if using_pyarrow_string_dtype() and is_string_array( + if using_string_dtype() and is_string_array( values, # type: ignore[arg-type] skipna=True, ): diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 3e17175167f25..03ef1792f1fb8 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -32,7 +32,7 @@ import numpy as np -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -2215,7 +2215,7 @@ def read_table( from pandas.io._util import _arrow_dtype_mapping mapping = _arrow_dtype_mapping().get - elif using_pyarrow_string_dtype(): + elif using_string_dtype(): from pandas.io._util import arrow_string_types_mapper arrow_string_types_mapper() diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 4ffd76722286a..884e6e002800e 100644 --- 
a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -8,7 +8,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas.util._test_decorators as td @@ -303,7 +303,7 @@ def test_iadd_string(self): index += "_x" assert "a_x" in index - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="add doesn't work") + @pytest.mark.xfail(using_string_dtype(), reason="add doesn't work") def test_add(self): index = pd.Index([str(i) for i in range(10)]) expected = pd.Index(index.values * 2) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 373f1c95463fc..6813683cb5219 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.core.dtypes.common import ( is_float_dtype, @@ -449,7 +449,7 @@ def test_constructor_str_unknown(self): with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo") - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings") + @pytest.mark.xfail(using_string_dtype(), reason="Can't be NumPy strings") def test_constructor_np_strs(self): # GH#31499 Hashtable.map_locations needs to work on np.str_ objects cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")]) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index ef0315130215c..e2e5d47f50209 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( Categorical, @@ -78,7 +78,7 @@ def test_print_none_width(self): assert exp == repr(a) @pytest.mark.skipif( - using_pyarrow_string_dtype(), + using_string_dtype(), reason="Change once infer_string is set to True by default", ) def test_unicode_print(self): diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 65e234e799353..3e0d8b1afedc0 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import PYPY @@ -83,7 +83,7 @@ def test_ndarray_compat_properties(index_or_series_obj): @pytest.mark.skipif( - PYPY or using_pyarrow_string_dtype(), + PYPY or using_string_dtype(), reason="not relevant for PyPy doesn't work properly for arrow strings", ) def test_memory_usage(index_or_series_memory_obj): diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index d3fe144f70cfc..8314fa56b5bda 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd import pandas._testing as tm @@ -100,7 +100,7 @@ def test_nunique_null(null_obj, index_or_series_obj): @pytest.mark.single_cpu -@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="decoding fails") +@pytest.mark.xfail(using_string_dtype(), 
reason="decoding fails") def test_unique_bad_unicode(index_or_series): # regression test for #34550 uval = "\ud83d" # smiley emoji diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 5cd66d8a874c7..fad2560265d21 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.core.dtypes.common import is_string_dtype @@ -37,7 +37,7 @@ def _get_expected_exception( else: result = self.frame_scalar_exc - if using_pyarrow_string_dtype() and result is not None: + if using_string_dtype() and result is not None: import pyarrow as pa result = ( # type: ignore[assignment] diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 6f33b18b19c51..135ea67c924d0 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -18,7 +18,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import Categorical @@ -103,7 +103,7 @@ def test_contains(self, data, data_missing): continue assert na_value_obj not in data # this section suffers from super method - if not using_pyarrow_string_dtype(): + if not using_string_dtype(): assert na_value_obj in data_missing def test_empty(self, dtype): diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index 60a8e688b3b8a..4237e796e052e 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( DataFrame, @@ -44,9 +44,7 @@ def test_constructor_single_row(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif( - using_pyarrow_string_dtype(), reason="columns inferring logic broken" - ) + @pytest.mark.skipif(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_list_of_series(self): data = [ OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 3622571f1365d..4eaf32798ca60 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -6,7 +6,7 @@ import pytest import pytz -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import is_platform_little_endian @@ -59,7 +59,7 @@ def test_from_records_with_datetimes(self): tm.assert_frame_equal(result, expected) @pytest.mark.skipif( - using_pyarrow_string_dtype(), reason="dtype checking logic doesn't work" + using_string_dtype(), reason="dtype checking logic doesn't work" ) def test_from_records_sequencelike(self): df = DataFrame( diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 89c50a8c21e1c..774e938e887b4 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype 
import pandas.util._test_decorators as td @@ -91,7 +91,7 @@ def test_fillna_datetime(self, datetime_frame): with pytest.raises(ValueError, match=msg): datetime_frame.fillna(5, method="ffill") - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") + @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") def test_fillna_mixed_type(self, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan @@ -664,7 +664,7 @@ def test_fillna_col_reordering(self): filled = df.fillna(method="ffill") assert df.columns.tolist() == filled.columns.tolist() - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") + @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") def test_fill_corner(self, float_frame, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 252b950004bea..7bde3041d46c9 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.errors import ChainedAssignmentError import pandas.util._test_decorators as td @@ -70,7 +70,7 @@ def test_interpolate_inplace(self, frame_or_series, using_array_manager, request assert orig.squeeze()[1] == 1.5 @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + using_string_dtype(), reason="interpolate doesn't work for string" ) def test_interp_basic(self, using_copy_on_write): df = DataFrame( @@ -114,7 +114,7 @@ def test_interp_basic(self, using_copy_on_write): assert np.shares_memory(df["D"]._values, dvalues) @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + using_string_dtype(), reason="interpolate doesn't work for string" ) def test_interp_basic_with_non_range_index(self, using_infer_string): df = DataFrame( diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 8bfa98042eb07..0884c091ba96a 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -30,9 +30,7 @@ def mix_abc() -> dict[str, list[float | str]]: class TestDataFrameReplace: - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_inplace(self, datetime_frame, float_string_frame): datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan @@ -299,9 +297,7 @@ def test_regex_replace_dict_nested_non_first_character( expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_dict_nested_gh4115(self): df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 
2}) @@ -310,9 +306,7 @@ def test_regex_replace_dict_nested_gh4115(self): result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_list_to_scalar(self, mix_abc): df = DataFrame(mix_abc) expec = DataFrame( @@ -341,9 +335,7 @@ def test_regex_replace_list_to_scalar(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? df = DataFrame(mix_abc) @@ -359,9 +351,7 @@ def test_regex_replace_str_to_numeric(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) @@ -566,9 +556,7 @@ def test_replace_series_dict(self): result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_convert(self): # gh 3907 df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) @@ -580,9 +568,7 @@ def test_replace_convert(self): res = rep.dtypes tm.assert_series_equal(expec, res) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_mixed(self, float_string_frame): mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan @@ -946,9 +932,7 @@ def test_replace_input_formats_listlike(self): with pytest.raises(ValueError, match=msg): df.replace(to_rep, values[1:]) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_input_formats_scalar(self): df = DataFrame( {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} @@ -977,9 +961,7 @@ def test_replace_limit(self): # TODO pass - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_dict_no_regex(self): answer = Series( { @@ -1003,9 +985,7 @@ def test_replace_dict_no_regex(self): result = answer.replace(weights) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_series_no_regex(self): answer = Series( { @@ -1112,9 +1092,7 @@ def test_nested_dict_overlapping_keys_replace_str(self): expected = df.replace({"a": dict(zip(astr, bstr))}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + 
@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_swapping_bug(self, using_infer_string): df = DataFrame({"a": [True, False, True]}) res = df.replace({"a": {True: "Y", False: "N"}}) @@ -1126,9 +1104,7 @@ def test_replace_swapping_bug(self, using_infer_string): expect = DataFrame({"a": ["Y", "N", "Y"]}) tm.assert_frame_equal(res, expect) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_period(self): d = { "fname": { @@ -1165,9 +1141,7 @@ def test_replace_period(self): result = df.replace(d) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_datetime(self): d = { "fname": { @@ -1393,9 +1367,7 @@ def test_replace_commutative(self, df, to_replace, exp): result = df.replace(to_replace) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize( "replacer", [ @@ -1672,9 +1644,7 @@ def test_regex_replace_scalar( expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_frame(self, regex): # GH-48644 diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index c7b444045a0f2..339800538f47b 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._config.config import option_context import pandas as pd @@ -113,7 +113,7 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(empty_frame) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="surrogates not allowed") + @pytest.mark.xfail(using_string_dtype(), reason="surrogates not allowed") def test_column_name_contains_unicode_surrogate(self): # GH 25509 colname = "\ud83d" diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 0593de7556406..e2d469f1124d3 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,7 +11,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas.util._test_decorators as td @@ -253,9 +253,7 @@ def test_timestamp_compare(self, left, right): with pytest.raises(TypeError, match=msg): right_f(pd.Timestamp("nat"), df) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't compare string and int" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't compare string and int") def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, # not raise TypeError diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index cae2f6e81d384..b45eca127b3e4 100644 --- 
a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -21,7 +21,7 @@ import pytest import pytz -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat.numpy import np_version_gt2 @@ -327,7 +327,7 @@ def test_constructor_dtype_nocast_view_2d_array( assert df2._mgr.arrays[0].flags.c_contiguous @td.skip_array_manager_invalid_test - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") + @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") def test_1d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") @@ -335,7 +335,7 @@ def test_1d_object_array_does_not_copy(self): assert np.shares_memory(df.values, arr) @td.skip_array_manager_invalid_test - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") + @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") def test_2d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 66145c32c18d7..45d06c56d353f 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import ( IS64, @@ -465,7 +465,7 @@ def test_mixed_ops(self, op): getattr(df, op)() @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="sum doesn't work for arrow strings" + using_string_dtype(), reason="sum doesn't work for arrow strings" ) def test_reduce_mixed_frame(self): # GH 6806 @@ -1960,9 +1960,7 @@ def test_sum_timedelta64_skipna_false(using_array_manager, request): tm.assert_series_equal(result, expected) -@pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="sum doesn't work with arrow strings" -) +@pytest.mark.xfail(using_string_dtype(), reason="sum doesn't work with arrow strings") def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 df = DataFrame([["a", 1]], columns=list("ab")) diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index 776007fb9691d..f7700af6beea0 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -7,7 +7,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( NA, @@ -176,7 +176,7 @@ def test_repr_mixed_big(self): repr(biggie) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="/r in") + @pytest.mark.xfail(using_string_dtype(), reason="/r in") def test_repr(self): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index f30b578cfcf56..955e3be107f75 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas._config.config as cf from pandas import Index @@ -16,7 +16,7 @@ def 
test_repr_is_valid_construction_code(self): res = eval(repr(idx)) tm.assert_index_equal(res, idx) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ @@ -81,7 +81,7 @@ def test_string_index_repr(self, index, expected): result = repr(index) assert result == expected - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 03a298a13dc2b..166e628ae4b3e 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import index as libindex from pandas._libs.arrays import NDArrayBacked @@ -196,7 +196,7 @@ def test_unique(self, data, categories, expected_data, ordered): expected = CategoricalIndex(expected_data, dtype=dtype) tm.assert_index_equal(idx.unique(), expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr doesn't roundtrip") + @pytest.mark.xfail(using_string_dtype(), reason="repr doesn't roundtrip") def test_repr_roundtrip(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) str(ci) diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py index 522ca1bc2afde..e8489e4ad8161 100644 --- a/pandas/tests/indexes/categorical/test_formats.py +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -3,7 +3,7 @@ """ import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas._config.config as cf from pandas import CategoricalIndex @@ -19,7 +19,7 @@ def test_format_different_scalar_lengths(self): with tm.assert_produces_warning(FutureWarning, match=msg): assert idx.format() == expected - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") def test_string_categorical_index_repr(self): # short idx = CategoricalIndex(["a", "bb", "ccc"]) diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index 3b8e18463160f..d20611a61b154 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( DataFrame, @@ -42,7 +42,7 @@ def test_repr_missing(self, constructor, expected, using_infer_string, request): result = repr(obj) assert result == expected - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") def test_repr_floats(self): # GH 32553 diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 1787379b0faee..bfbb0c3dda5c5 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import 
using_string_dtype from pandas._libs.tslibs import Timestamp @@ -426,7 +426,7 @@ def test_insert_base(self, index): assert index[0:4].equals(result) @pytest.mark.skipif( - using_pyarrow_string_dtype(), + using_string_dtype(), reason="completely different behavior, tested elsewher", ) def test_insert_out_of_bounds(self, index): diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 0e32399b131c3..31840cb84b7c4 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import ( IS64, @@ -834,7 +834,7 @@ def replacer(self, how, from_key, to_key): return replacer # Expected needs adjustment for the infer string option, seems to work as expecetd - @pytest.mark.skipif(using_pyarrow_string_dtype(), reason="TODO: test is to complex") + @pytest.mark.skipif(using_string_dtype(), reason="TODO: test is to complex") def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 57f45f867254d..e032936d09ce4 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,7 +8,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.errors import IndexingError @@ -461,9 +461,7 @@ def test_set_index_nan(self): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't multiply arrow strings" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't multiply arrow strings") def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df # set float64 to avoid upcast when setting nan @@ -689,7 +687,7 @@ def test_loc_setitem_fullindex_views(self): df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") + @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string") def test_rhs_alignment(self): # GH8258, tests that both rows & columns are aligned to what is # assigned to. 
covers both uniform data-type & multi-type cases diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 0cd1390d41461..d33719f3e2115 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -12,7 +12,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import index as libindex from pandas.compat.numpy import np_version_gt2 @@ -1262,7 +1262,7 @@ def test_loc_reverse_assignment(self): tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") + @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string") def test_loc_setitem_str_to_small_float_conversion_type(self): # GH#20388 diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 8da8535952dcf..8d36dc7520019 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -16,7 +16,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import is_platform_windows import pandas.util._test_decorators as td @@ -659,9 +659,7 @@ def test_dtype_backend_and_dtype(self, read_ext): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="infer_string takes precedence" - ) + @pytest.mark.xfail(using_string_dtype(), reason="infer_string takes precedence") def test_dtype_backend_string(self, read_ext, string_storage): # GH#36712 if read_ext in (".xlsb", ".xls"): diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 0ca29c219b55b..535ef76cb12f4 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -11,7 +11,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -1396,9 +1396,7 @@ def test_unicode_name_in_footer(self): sf = fmt.SeriesFormatter(s, name="\u05e2\u05d1\u05e8\u05d9\u05ea") sf._get_footer() # should not raise exception - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="Fixup when arrow is default" - ) + @pytest.mark.xfail(using_string_dtype(), reason="Fixup when arrow is default") def test_east_asian_unicode_series(self): # not aligned properly because of east asian width @@ -1773,9 +1771,7 @@ def chck_ncols(self, s): ncolsizes = len({len(line.strip()) for line in lines}) assert ncolsizes == 1 - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="change when arrow is default" - ) + @pytest.mark.xfail(using_string_dtype(), reason="change when arrow is default") def test_format_explicit(self): test_sers = gen_series_formatting() with option_context("display.max_rows", 4, "display.show_dimensions", False): diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 2e5a5005cb076..164e514262603 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -10,7 +10,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( CategoricalIndex, @@ -851,7 +851,7 @@ def test_to_string(self): frame.to_string() # TODO: split or simplify this test? 
- @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="fix when arrow is default") + @pytest.mark.xfail(using_string_dtype(), reason="fix when arrow is default") def test_to_string_index_with_nan(self): # GH#2850 df = DataFrame( diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 5279f3f1cdfbe..2157498aea95e 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -13,7 +13,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import IS64 import pandas.util._test_decorators as td @@ -1492,7 +1492,7 @@ def test_from_json_to_json_table_dtypes(self): # TODO: We are casting to string which coerces None to NaN before casting back # to object, ending up with incorrect na values - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="incorrect na conversion") + @pytest.mark.xfail(using_string_dtype(), reason="incorrect na conversion") @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"]) def test_to_json_from_json_columns_dtypes(self, orient): # GH21892 GH33205 @@ -1751,7 +1751,7 @@ def test_to_json_indent(self, indent): assert result == expected @pytest.mark.skipif( - using_pyarrow_string_dtype(), + using_string_dtype(), reason="Adjust expected when infer_string is default, no bug here, " "just a complicated parametrization", ) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 18a449b4d0c67..7b27d19483bd2 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.errors import PerformanceWarning @@ -2611,7 +2611,7 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() # pylint: disable=missing-kwoa - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") def test_pivot_columns_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2627,7 +2627,7 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") def test_pivot_index_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2641,7 +2641,7 @@ def test_pivot_index_is_none(self): expected = DataFrame(3, index=[1], columns=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index c978481ca9988..013386202c966 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.core.dtypes.common import is_integer @@ -232,7 
+232,7 @@ def test_where_ndframe_align(): tm.assert_series_equal(out, expected) -@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set ints into string") +@pytest.mark.xfail(using_string_dtype(), reason="can't set ints into string") def test_where_setitem_invalid(): # GH 2702 # make sure correct exceptions are raised on invalid list assignment diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 6f0c8d751a92a..aa44eccf67446 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas.util._test_decorators as td @@ -24,9 +24,7 @@ import pandas._testing as tm -@pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="share memory doesn't work for arrow" -) +@pytest.mark.xfail(using_string_dtype(), reason="share memory doesn't work for arrow") def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index b0f4e233ba5eb..c0191abced797 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd import pandas._testing as tm @@ -391,7 +391,7 @@ def test_replace_mixed_types_with_string(self): expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") + @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") @pytest.mark.parametrize( "categorical, numeric", [ @@ -731,7 +731,7 @@ def test_replace_nullable_numeric(self): with pytest.raises(TypeError, match="Invalid value"): ints.replace(1, 9.5) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 1 in string") + @pytest.mark.xfail(using_string_dtype(), reason="can't fill 1 in string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_series(self, regex): # GH-48644 diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index a1c5018ea7961..f684e8fc1e724 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -145,7 +145,7 @@ def test_tidy_repr_name_0(self, arg): assert "Name: 0" in rep_str @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="TODO: investigate why this is failing" + using_string_dtype(), reason="TODO: investigate why this is failing" ) def test_newline(self): ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) From f5ca6835bcac39a09dfad21bac9afee4e27e04a9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Jul 2024 17:14:00 +0200 Subject: [PATCH 199/396] TST (string dtype): clean-up xpasssing tests with future string dtype (#59323) --- pandas/tests/arithmetic/test_object.py | 3 --- pandas/tests/base/test_unique.py | 5 +---- pandas/tests/frame/constructors/test_from_dict.py | 3 ++- 
pandas/tests/frame/constructors/test_from_records.py | 4 +--- pandas/tests/frame/methods/test_fillna.py | 2 ++ pandas/tests/frame/methods/test_info.py | 11 ++++++----- pandas/tests/frame/methods/test_interpolate.py | 1 + pandas/tests/frame/test_arithmetic.py | 3 --- pandas/tests/indexes/interval/test_formats.py | 7 ++----- pandas/tests/indexing/test_coercion.py | 4 ---- pandas/tests/indexing/test_indexing.py | 4 ---- pandas/tests/series/methods/test_reindex.py | 3 --- pandas/tests/series/methods/test_replace.py | 1 - pandas/tests/series/test_formats.py | 2 +- 14 files changed, 16 insertions(+), 37 deletions(-) diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 884e6e002800e..4b5156d0007bb 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -303,7 +301,6 @@ def test_iadd_string(self): index += "_x" assert "a_x" in index - @pytest.mark.xfail(using_string_dtype(), reason="add doesn't work") def test_add(self): index = pd.Index([str(i) for i in range(10)]) expected = pd.Index(index.values * 2) diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 8314fa56b5bda..1add56b47b363 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.tests.base.common import allow_na_ops @@ -100,12 +98,11 @@ def test_nunique_null(null_obj, index_or_series_obj): @pytest.mark.single_cpu -@pytest.mark.xfail(using_string_dtype(), reason="decoding fails") def test_unique_bad_unicode(index_or_series): # regression test for #34550 uval = "\ud83d" # smiley emoji - obj = index_or_series([uval] * 2) + obj = index_or_series([uval] * 2, dtype=object) result = obj.unique() if isinstance(obj, pd.Index): diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index 4237e796e052e..fc7c03dc25839 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -44,7 +44,7 @@ def test_constructor_single_row(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(using_string_dtype(), reason="columns inferring logic broken") + @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_list_of_series(self): data = [ OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), @@ -108,6 +108,7 @@ def test_constructor_list_of_series(self): expected = DataFrame.from_dict(sdict, orient="index") tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_orient(self, float_string_frame): data_dict = float_string_frame.T._series recons = DataFrame.from_dict(data_dict, orient="index") diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 4eaf32798ca60..58e47ba48f894 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -58,9 +58,7 @@ def test_from_records_with_datetimes(self): expected["EXPIRY"] = expected["EXPIRY"].astype("M8[s]") tm.assert_frame_equal(result, expected) - 
@pytest.mark.skipif( - using_string_dtype(), reason="dtype checking logic doesn't work" - ) + @pytest.mark.xfail(using_string_dtype(), reason="dtype checking logic doesn't work") def test_from_records_sequencelike(self): df = DataFrame( { diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 774e938e887b4..e2baa2567f5b4 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -91,6 +91,7 @@ def test_fillna_datetime(self, datetime_frame): with pytest.raises(ValueError, match=msg): datetime_frame.fillna(5, method="ffill") + # TODO(infer_string) test as actual error instead of xfail @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") def test_fillna_mixed_type(self, float_string_frame): mf = float_string_frame @@ -664,6 +665,7 @@ def test_fillna_col_reordering(self): filled = df.fillna(method="ffill") assert df.columns.tolist() == filled.columns.tolist() + # TODO(infer_string) test as actual error instead of xfail @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") def test_fill_corner(self, float_frame, float_string_frame): mf = float_string_frame diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index fcb7677f03f27..c3d02a07f397e 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -15,6 +15,7 @@ from pandas import ( CategoricalIndex, DataFrame, + Index, MultiIndex, Series, date_range, @@ -360,7 +361,7 @@ def test_info_memory_usage(): df = DataFrame(data) df.columns = dtypes - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object)) df_with_object_index.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() assert re.match(r"memory usage: [^+]+\+", res[-1]) @@ -398,25 +399,25 @@ def test_info_memory_usage(): @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") def test_info_memory_usage_deep_not_pypy(): - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object)) assert ( df_with_object_index.memory_usage(index=True, deep=True).sum() > df_with_object_index.memory_usage(index=True).sum() ) - df_object = DataFrame({"a": ["a"]}) + df_object = DataFrame({"a": Series(["a"], dtype=object)}) assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() @pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result") def test_info_memory_usage_deep_pypy(): - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object)) assert ( df_with_object_index.memory_usage(index=True, deep=True).sum() == df_with_object_index.memory_usage(index=True).sum() ) - df_object = DataFrame({"a": ["a"]}) + df_object = DataFrame({"a": Series(["a"], dtype=object)}) assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 7bde3041d46c9..bbb5e59e4a274 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -69,6 +69,7 @@ def test_interpolate_inplace(self, frame_or_series, using_array_manager, request assert np.shares_memory(orig, obj.values) assert orig.squeeze()[1] == 1.5 + # 
TODO(infer_string) raise proper TypeError in case of string dtype @pytest.mark.xfail( using_string_dtype(), reason="interpolate doesn't work for string" ) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index e2d469f1124d3..f68785a354d7e 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,8 +11,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -253,7 +251,6 @@ def test_timestamp_compare(self, left, right): with pytest.raises(TypeError, match=msg): right_f(pd.Timestamp("nat"), df) - @pytest.mark.xfail(using_string_dtype(), reason="can't compare string and int") def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, # not raise TypeError diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index d20611a61b154..f858ae137ca4e 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, DatetimeIndex, @@ -42,12 +40,11 @@ def test_repr_missing(self, constructor, expected, using_infer_string, request): result = repr(obj) assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="repr different") def test_repr_floats(self): # GH 32553 markers = Series( - ["foo", "bar"], + [1, 2], index=IntervalIndex( [ Interval(left, right) @@ -59,7 +56,7 @@ def test_repr_floats(self): ), ) result = str(markers) - expected = "(329.973, 345.137] foo\n(345.137, 360.191] bar\ndtype: object" + expected = "(329.973, 345.137] 1\n(345.137, 360.191] 2\ndtype: int64" assert result == expected @pytest.mark.parametrize( diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 31840cb84b7c4..c0a62ecb06f56 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( IS64, is_platform_windows, @@ -833,8 +831,6 @@ def replacer(self, how, from_key, to_key): raise ValueError return replacer - # Expected needs adjustment for the infer string option, seems to work as expecetd - @pytest.mark.skipif(using_string_dtype(), reason="TODO: test is to complex") def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index e032936d09ce4..162c2c58ff744 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError from pandas.core.dtypes.common import ( @@ -461,7 +459,6 @@ def test_set_index_nan(self): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail(using_string_dtype(), reason="can't multiply arrow strings") def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df # set float64 to avoid upcast when setting nan @@ -687,7 +684,6 @@ def test_loc_setitem_fullindex_views(self): df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) - 
@pytest.mark.xfail(using_string_dtype(), reason="can't set int into string") def test_rhs_alignment(self): # GH8258, tests that both rows & columns are aligned to what is # assigned to. covers both uniform data-type & multi-type cases diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index aa44eccf67446..0923a2d42ce10 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from pandas import ( @@ -24,7 +22,6 @@ import pandas._testing as tm -@pytest.mark.xfail(using_string_dtype(), reason="share memory doesn't work for arrow") def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index c0191abced797..850740fac907d 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -391,7 +391,6 @@ def test_replace_mixed_types_with_string(self): expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) - @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") @pytest.mark.parametrize( "categorical, numeric", [ diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index f684e8fc1e724..4939f3221d268 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -145,7 +145,7 @@ def test_tidy_repr_name_0(self, arg): assert "Name: 0" in rep_str @pytest.mark.xfail( - using_string_dtype(), reason="TODO: investigate why this is failing" + using_string_dtype(), reason="TODO(infer_string): investigate failure" ) def test_newline(self): ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) From e5c6a082cca41942bd3e935e6aa42e34dd4b2924 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Jul 2024 21:40:31 +0200 Subject: [PATCH 200/396] String dtype: rename the storage options and add `na_value` keyword in `StringDtype()` (#59330) * rename storage option and add na_value keyword * update init * fix propagating na_value to Array class + fix some tests * fix more tests * disallow pyarrow_numpy as option + fix more cases of checking storage to be pyarrow_numpy * restore pyarrow_numpy as option for now * linting * try fix typing * try fix typing * fix dtype equality to take into account the NaN vs NA * fix pickling of dtype * fix test_convert_dtypes * update expected result for dtype='string' * suppress typing error with _metadata attribute --- pandas/_libs/lib.pyx | 2 +- pandas/_testing/__init__.py | 4 +- pandas/core/arrays/arrow/array.py | 6 +- pandas/core/arrays/string_.py | 89 ++++++++++++++----- pandas/core/arrays/string_arrow.py | 11 ++- pandas/core/construction.py | 4 +- pandas/core/dtypes/cast.py | 2 +- pandas/core/indexes/base.py | 3 +- pandas/core/internals/construction.py | 2 +- pandas/core/reshape/encoding.py | 3 +- pandas/core/reshape/merge.py | 3 +- pandas/core/tools/numeric.py | 9 +- pandas/io/_util.py | 6 +- pandas/tests/arrays/string_/test_string.py | 83 +++++++++++------ .../tests/arrays/string_/test_string_arrow.py | 4 +- pandas/tests/extension/base/methods.py | 8 +- pandas/tests/extension/test_string.py | 44 +++++---- .../frame/methods/test_convert_dtypes.py | 1 + pandas/tests/series/test_constructors.py | 7 +- 
pandas/tests/strings/__init__.py | 2 +- 20 files changed, 186 insertions(+), 107 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1c2bba031e523..1222b33aac3c1 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2728,7 +2728,7 @@ def maybe_convert_objects(ndarray[object] objects, if using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 87d419e2db8dd..994b351acf42c 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -527,14 +527,14 @@ def shares_memory(left, right) -> bool: if ( isinstance(left, ExtensionArray) and is_string_dtype(left.dtype) - and left.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] + and left.dtype.storage == "pyarrow" # type: ignore[attr-defined] ): # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 left = cast("ArrowExtensionArray", left) if ( isinstance(right, ExtensionArray) and is_string_dtype(right.dtype) - and right.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] + and right.dtype.storage == "pyarrow" # type: ignore[attr-defined] ): right = cast("ArrowExtensionArray", right) left_pa_data = left._pa_array diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index f2b8aa75ca5bf..a156042ac0c0e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -570,10 +570,8 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage in ( - "pyarrow", - "pyarrow_numpy", - ): + if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + # TODO(infer_string) should this be large_string? pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 00197a150fb97..c40f5b8f58d9e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -8,7 +8,10 @@ import numpy as np -from pandas._config import get_option +from pandas._config import ( + get_option, + using_string_dtype, +) from pandas._libs import ( lib, @@ -80,8 +83,10 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow", "pyarrow_numpy"}, optional + storage : {"python", "pyarrow"}, optional If not given, the value of ``pd.options.mode.string_storage``. + na_value : {np.nan, pd.NA}, default pd.NA + Whether the dtype follows NaN or NA missing value semantics. Attributes ---------- @@ -108,30 +113,67 @@ class StringDtype(StorageExtensionDtype): # follows NumPy semantics, which uses nan. 
@property def na_value(self) -> libmissing.NAType | float: # type: ignore[override] - if self.storage == "pyarrow_numpy": - return np.nan - else: - return libmissing.NA + return self._na_value - _metadata = ("storage",) + _metadata = ("storage", "_na_value") # type: ignore[assignment] - def __init__(self, storage=None) -> None: + def __init__( + self, + storage: str | None = None, + na_value: libmissing.NAType | float = libmissing.NA, + ) -> None: + # infer defaults if storage is None: - infer_string = get_option("future.infer_string") - if infer_string: - storage = "pyarrow_numpy" + if using_string_dtype(): + storage = "pyarrow" else: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow", "pyarrow_numpy"}: + + if storage == "pyarrow_numpy": + # TODO raise a deprecation warning + storage = "pyarrow" + na_value = np.nan + + # validate options + if storage not in {"python", "pyarrow"}: raise ValueError( - f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " - f"Got {storage} instead." + f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." ) - if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1: + if storage == "pyarrow" and pa_version_under10p1: raise ImportError( "pyarrow>=10.0.1 is required for PyArrow backed StringArray." ) + + if isinstance(na_value, float) and np.isnan(na_value): + # when passed a NaN value, always set to np.nan to ensure we use + # a consistent NaN value (and we can use `dtype.na_value is np.nan`) + na_value = np.nan + elif na_value is not libmissing.NA: + raise ValueError("'na_value' must be np.nan or pd.NA, got {na_value}") + self.storage = storage + self._na_value = na_value + + def __eq__(self, other: object) -> bool: + # we need to override the base class __eq__ because na_value (NA or NaN) + # cannot be checked with normal `==` + if isinstance(other, str): + if other == self.name: + return True + try: + other = self.construct_from_string(other) + except TypeError: + return False + if isinstance(other, type(self)): + return self.storage == other.storage and self.na_value is other.na_value + return False + + def __hash__(self) -> int: + # need to override __hash__ as well because of overriding __eq__ + return super().__hash__() + + def __reduce__(self): + return StringDtype, (self.storage, self.na_value) @property def type(self) -> type[str]: @@ -176,6 +218,7 @@ def construct_from_string(cls, string) -> Self: elif string == "string[pyarrow]": return cls(storage="pyarrow") elif string == "string[pyarrow_numpy]": + # TODO deprecate return cls(storage="pyarrow_numpy") else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @@ -200,7 +243,7 @@ def construct_array_type( # type: ignore[override] if self.storage == "python": return StringArray - elif self.storage == "pyarrow": + elif self.storage == "pyarrow" and self._na_value is libmissing.NA: return ArrowStringArray else: return ArrowStringArrayNumpySemantics @@ -212,13 +255,17 @@ def __from_arrow__( Construct StringArray from pyarrow Array/ChunkedArray. 
""" if self.storage == "pyarrow": - from pandas.core.arrays.string_arrow import ArrowStringArray + if self._na_value is libmissing.NA: + from pandas.core.arrays.string_arrow import ArrowStringArray + + return ArrowStringArray(array) + else: + from pandas.core.arrays.string_arrow import ( + ArrowStringArrayNumpySemantics, + ) - return ArrowStringArray(array) - elif self.storage == "pyarrow_numpy": - from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics + return ArrowStringArrayNumpySemantics(array) - return ArrowStringArrayNumpySemantics(array) else: import pyarrow diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 50527dace0b82..94f6f9064885e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -125,6 +125,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] _storage = "pyarrow" + _na_value: libmissing.NAType | float = libmissing.NA def __init__(self, values) -> None: _chk_pyarrow_available() @@ -134,7 +135,7 @@ def __init__(self, values) -> None: values = pc.cast(values, pa.large_string()) super().__init__(values) - self._dtype = StringDtype(storage=self._storage) + self._dtype = StringDtype(storage=self._storage, na_value=self._na_value) if not pa.types.is_large_string(self._pa_array.type) and not ( pa.types.is_dictionary(self._pa_array.type) @@ -179,10 +180,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage in ( - "pyarrow", - "pyarrow_numpy", - ) + assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and @@ -598,7 +596,8 @@ def _rank( class ArrowStringArrayNumpySemantics(ArrowStringArray): - _storage = "pyarrow_numpy" + _storage = "pyarrow" + _na_value = np.nan @classmethod def _result_converter(cls, values, na=None): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 3de28ea242ce8..748b9e4947bec 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -569,7 +569,7 @@ def sanitize_array( if isinstance(data, str) and using_string_dtype() and original_dtype is None: from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype("pyarrow_numpy") + dtype = StringDtype("pyarrow", na_value=np.nan) data = construct_1d_arraylike_from_scalar(data, len(index), dtype) return data @@ -606,7 +606,7 @@ def sanitize_array( elif data.dtype.kind == "U" and using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) if subarr is data and copy: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index af4c7c2c7c4f8..5d4e56cd8f800 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -801,7 +801,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: if using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) elif isinstance(val, (np.datetime64, 
dt.datetime)): try: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 76ddfffaa8a4d..11d2436b0e095 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5620,9 +5620,10 @@ def equals(self, other: Any) -> bool: if ( isinstance(self.dtype, StringDtype) - and self.dtype.storage == "pyarrow_numpy" + and self.dtype.na_value is np.nan and other.dtype != self.dtype ): + # TODO(infer_string) can we avoid this special case? # special case for object behavior return other.equals(self.astype(object)) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 14d7cadd21400..144416fc11691 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -376,7 +376,7 @@ def ndarray_to_mgr( nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] elif dtype is None and values.dtype.kind == "U" and using_string_dtype(): - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) obj_columns = list(values) block_values = [ diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 3ed67bb7b7c02..85c10f1166577 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -13,6 +13,7 @@ import numpy as np +from pandas._libs import missing as libmissing from pandas._libs.sparse import IntIndex from pandas.core.dtypes.common import ( @@ -260,7 +261,7 @@ def _get_dummies_1d( dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment] elif ( isinstance(input_dtype, StringDtype) - and input_dtype.storage != "pyarrow_numpy" + and input_dtype.na_value is libmissing.NA ): dtype = pandas_dtype("boolean") # type: ignore[assignment] else: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 646f40f6141d8..dc2df25c3f786 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2473,8 +2473,7 @@ def _factorize_keys( elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( - isinstance(lk.dtype, StringDtype) - and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"] + isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow" ): import pyarrow as pa import pyarrow.compute as pc diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 09652a7d8bc92..ca703e0362611 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -8,7 +8,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend @@ -235,7 +238,7 @@ def to_numeric( coerce_numeric=coerce_numeric, convert_to_masked_nullable=dtype_backend is not lib.no_default or isinstance(values_dtype, StringDtype) - and not values_dtype.storage == "pyarrow_numpy", + and values_dtype.na_value is libmissing.NA, ) except (ValueError, TypeError): if errors == "raise": @@ -250,7 +253,7 @@ def to_numeric( dtype_backend is not lib.no_default and new_mask is None or isinstance(values_dtype, StringDtype) - and not values_dtype.storage == "pyarrow_numpy" + and values_dtype.na_value is libmissing.NA ): new_mask = np.zeros(values.shape, dtype=np.bool_) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 3b2ae5daffdba..2dc3b0e6c80ef 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -2,6 +2,8 @@ from typing import 
Callable +import numpy as np + from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -29,6 +31,6 @@ def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") return { - pa.string(): pd.StringDtype(storage="pyarrow_numpy"), - pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"), + pa.string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), + pa.large_string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), }.get diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 320bdca60a932..3ef7862d739cd 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -19,13 +19,6 @@ ) -def na_val(dtype): - if dtype.storage == "pyarrow_numpy": - return np.nan - else: - return pd.NA - - @pytest.fixture def dtype(string_storage): """Fixture giving StringDtype from parametrized 'string_storage'""" @@ -38,24 +31,45 @@ def cls(dtype): return dtype.construct_array_type() +def test_dtype_equality(): + pytest.importorskip("pyarrow") + + dtype1 = pd.StringDtype("python") + dtype2 = pd.StringDtype("pyarrow") + dtype3 = pd.StringDtype("pyarrow", na_value=np.nan) + + assert dtype1 == pd.StringDtype("python", na_value=pd.NA) + assert dtype1 != dtype2 + assert dtype1 != dtype3 + + assert dtype2 == pd.StringDtype("pyarrow", na_value=pd.NA) + assert dtype2 != dtype1 + assert dtype2 != dtype3 + + assert dtype3 == pd.StringDtype("pyarrow", na_value=np.nan) + assert dtype3 == pd.StringDtype("pyarrow", na_value=float("nan")) + assert dtype3 != dtype1 + assert dtype3 != dtype2 + + def test_repr(dtype): df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = " A\n0 a\n1 NaN\n2 b" else: expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" else: expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - if dtype.storage == "pyarrow": + if dtype.storage == "pyarrow" and dtype.na_value is pd.NA: arr_name = "ArrowStringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" - elif dtype.storage == "pyarrow_numpy": + elif dtype.storage == "pyarrow" and dtype.na_value is np.nan: arr_name = "ArrowStringArrayNumpySemantics" expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" else: @@ -67,7 +81,7 @@ def test_repr(dtype): def test_none_to_nan(cls, dtype): a = cls._from_sequence(["a", None, "b"], dtype=dtype) assert a[1] is not None - assert a[1] is na_val(a.dtype) + assert a[1] is a.dtype.na_value def test_setitem_validates(cls, dtype): @@ -224,7 +238,7 @@ def test_comparison_methods_scalar(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = np.array([getattr(item, op_name)(other) for item in a]) if comparison_op == operator.ne: expected[1] = True @@ -243,7 +257,7 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: if operator.ne == comparison_op: expected = np.array([True, True, True]) else: @@ -270,7 +284,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): result 
= getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected_data = { "__eq__": [False, False, False], "__ne__": [True, True, True], @@ -292,7 +306,7 @@ def test_comparison_methods_array(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: if operator.ne == comparison_op: expected = np.array([True, True, False]) else: @@ -386,7 +400,7 @@ def test_astype_int(dtype): tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: err = ValueError msg = "cannot convert float NaN to integer" else: @@ -443,7 +457,7 @@ def test_min_max(method, skipna, dtype): expected = "a" if method == "min" else "c" assert result == expected else: - assert result is na_val(arr.dtype) + assert result is arr.dtype.na_value @pytest.mark.parametrize("method", ["min", "max"]) @@ -492,7 +506,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) - if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: + if dtype.storage == "pyarrow" and pa_version_under12p0: expected = pa.chunked_array(expected) if dtype.storage == "python": expected = pc.cast(expected, pa.string()) @@ -524,7 +538,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): expected = df.astype(f"string[{string_storage2}]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None - assert result.loc[2, "a"] is na_val(result["a"].dtype) + assert result.loc[2, "a"] is result["a"].dtype.na_value @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") @@ -558,10 +572,10 @@ def test_arrow_load_from_zero_chunks( def test_value_counts_na(dtype): - if getattr(dtype, "storage", "") == "pyarrow": - exp_dtype = "int64[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = "int64" + elif dtype.storage == "pyarrow": + exp_dtype = "int64[pyarrow]" else: exp_dtype = "Int64" arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) @@ -575,10 +589,10 @@ def test_value_counts_na(dtype): def test_value_counts_with_normalize(dtype): - if getattr(dtype, "storage", "") == "pyarrow": - exp_dtype = "double[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = np.float64 + elif dtype.storage == "pyarrow": + exp_dtype = "double[pyarrow]" else: exp_dtype = "Float64" ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) @@ -612,6 +626,19 @@ def test_use_inf_as_na(values, expected, dtype): tm.assert_frame_equal(result, expected) +def test_value_counts_sort_false(dtype): + if dtype.na_value is np.nan: + exp_dtype = "int64" + elif dtype.storage == "pyarrow": + exp_dtype = "int64[pyarrow]" + else: + exp_dtype = "Int64" + ser = pd.Series(["a", "b", "c", "b"], dtype=dtype) + result = ser.value_counts(sort=False) + expected = pd.Series([1, 2, 1], index=ser[:3], dtype=exp_dtype, name="count") + tm.assert_series_equal(result, expected) + + def test_memory_usage(dtype, arrow_string_storage): # GH 33963 @@ -635,7 +662,7 @@ def test_astype_from_float_dtype(float_dtype, dtype): def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], 
dtype=dtype) result = np.array(arr) - expected = np.array(["a", na_val(dtype), "b"], dtype=object) + expected = np.array(["a", dtype.na_value, "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -675,7 +702,7 @@ def test_setitem_scalar_with_mask_validation(dtype): mask = np.array([False, True, False]) ser[mask] = None - assert ser.array[1] is na_val(ser.dtype) + assert ser.array[1] is ser.dtype.na_value # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index d7811b6fed883..06013c3d11664 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -29,6 +29,8 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): if using_infer_string and string_storage != "pyarrow_numpy": request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) + if string_storage == "pyarrow_numpy": + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) @@ -260,6 +262,6 @@ def test_pickle_roundtrip(dtype): def test_string_dtype_error_message(): # GH#55051 pytest.importorskip("pyarrow") - msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." + msg = "Storage must be 'python' or 'pyarrow'." with pytest.raises(ValueError, match=msg): StringDtype("bla") diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index c803a8113b4a4..5cb2c14e4c841 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -66,14 +66,14 @@ def test_value_counts_with_normalize(self, data): expected = pd.Series(0.0, index=result.index, name="proportion") expected[result > 0] = 1 / len(values) - if getattr(data.dtype, "storage", "") == "pyarrow" or isinstance( + if isinstance(data.dtype, pd.StringDtype) and data.dtype.na_value is np.nan: + # TODO: avoid special-casing + expected = expected.astype("float64") + elif getattr(data.dtype, "storage", "") == "pyarrow" or isinstance( data.dtype, pd.ArrowDtype ): # TODO: avoid special-casing expected = expected.astype("double[pyarrow]") - elif getattr(data.dtype, "storage", "") == "pyarrow_numpy": - # TODO: avoid special-casing - expected = expected.astype("float64") elif na_value_for_dtype(data.dtype) is pd.NA: # TODO(GH#44692): avoid special-casing expected = expected.astype("Float64") diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 2d5a134f8560a..f3eec142c1968 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -95,9 +95,15 @@ def data_for_grouping(dtype, chunked): class TestStringArray(base.ExtensionTests): def test_eq_with_str(self, dtype): - assert dtype == f"string[{dtype.storage}]" super().test_eq_with_str(dtype) + if dtype.na_value is pd.NA: + # only the NA-variant supports parametrized string alias + assert dtype == f"string[{dtype.storage}]" + elif dtype.storage == "pyarrow": + # TODO(infer_string) deprecate this + assert dtype == "string[pyarrow_numpy]" + def test_is_not_string_type(self, dtype): # Different from BaseDtypeTests.test_is_not_string_type # because StringDtype is a string type @@ -143,28 +149,21 @@ def _get_expected_exception( self, op_name: str, obj, other ) -> 
type[Exception] | None: if op_name in ["__divmod__", "__rdivmod__"]: - if isinstance(obj, pd.Series) and cast( - StringDtype, tm.get_dtype(obj) - ).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + if ( + isinstance(obj, pd.Series) + and cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow" + ): # TODO: re-raise as TypeError? return NotImplementedError - elif isinstance(other, pd.Series) and cast( - StringDtype, tm.get_dtype(other) - ).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + elif ( + isinstance(other, pd.Series) + and cast(StringDtype, tm.get_dtype(other)).storage == "pyarrow" + ): # TODO: re-raise as TypeError? return NotImplementedError return TypeError elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]: - if cast(StringDtype, tm.get_dtype(obj)).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow": return NotImplementedError return TypeError elif op_name in ["__mul__", "__rmul__"]: @@ -178,10 +177,7 @@ def _get_expected_exception( "__sub__", "__rsub__", ]: - if cast(StringDtype, tm.get_dtype(obj)).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow": import pyarrow as pa # TODO: better to re-raise as TypeError? @@ -193,7 +189,7 @@ def _get_expected_exception( def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return ( op_name in ["min", "max"] - or ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] + or ser.dtype.na_value is np.nan # type: ignore[union-attr] and op_name in ("any", "all") ) @@ -201,10 +197,10 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: cast_to = dtype + elif dtype.na_value is np.nan: + cast_to = np.bool_ # type: ignore[assignment] elif dtype.storage == "pyarrow": cast_to = "boolean[pyarrow]" # type: ignore[assignment] - elif dtype.storage == "pyarrow_numpy": - cast_to = np.bool_ # type: ignore[assignment] else: cast_to = "boolean" # type: ignore[assignment] return pointwise_result.astype(cast_to) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 521d2cb14ac6a..9cbbebf35b2d1 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -18,6 +18,7 @@ def test_convert_dtypes( # Just check that it works for DataFrame here if using_infer_string: string_storage = "pyarrow_numpy" + df = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 387be8398e4b2..391c9361080d8 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2133,9 +2133,12 @@ def test_series_string_inference_array_string_dtype(self): tm.assert_series_equal(ser, expected) def test_series_string_inference_storage_definition(self): - # GH#54793 + # https://github.com/pandas-dev/pandas/issues/54793 + # but after PDEP-14 (string dtype), it was decided to keep dtype="string" + # returning the NA string dtype, so expected is changed from + # "string[pyarrow_numpy]" to "string[pyarrow]" pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") + expected = Series(["a", "b"], dtype="string[pyarrow]") with pd.option_context("future.infer_string", True): result = Series(["a", 
"b"], dtype="string") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index 01b49b5e5b633..e94f656fc9823 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -7,7 +7,7 @@ def _convert_na_value(ser, expected): if ser.dtype != object: - if ser.dtype.storage == "pyarrow_numpy": + if ser.dtype.na_value is np.nan: expected = expected.fillna(np.nan) else: # GH#18463 From 016726541b49a1ffe7e0b4a19e3ca64f09c2f7fa Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 20 Sep 2024 13:53:24 -0400 Subject: [PATCH 201/396] TST (string dtype): xfail all currently failing tests with future.infer_string (#59329) * TST (string dtype): xfail all currently failing tests with future.infer_string * more xfails * more xfails * add missing strict=False * also run slow and single cpu tests * fix single_cpu tests * xfail some slow tests * stop suppressing non-zero exit code from pytest on string CI build * remove accidentally added xlsx file --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 1 - ci/run_tests.sh | 6 ------ pandas/tests/apply/test_frame_apply.py | 6 ++++++ pandas/tests/apply/test_numba.py | 4 ++++ pandas/tests/apply/test_str.py | 4 ++++ pandas/tests/arrays/categorical/test_analytics.py | 3 +++ pandas/tests/arrays/categorical/test_api.py | 3 +++ .../tests/arrays/categorical/test_constructors.py | 1 + pandas/tests/arrays/floating/test_arithmetic.py | 3 +++ pandas/tests/arrays/integer/test_arithmetic.py | 3 +++ pandas/tests/arrays/masked/test_function.py | 3 +++ pandas/tests/copy_view/test_array.py | 3 +++ pandas/tests/copy_view/test_astype.py | 6 ++++++ pandas/tests/copy_view/test_constructors.py | 3 +++ pandas/tests/copy_view/test_functions.py | 8 ++++++++ pandas/tests/copy_view/test_internals.py | 3 +++ pandas/tests/copy_view/test_interp_fillna.py | 4 ++++ pandas/tests/copy_view/test_methods.py | 4 ++++ pandas/tests/copy_view/test_replace.py | 5 +++++ pandas/tests/dtypes/test_dtypes.py | 3 +++ pandas/tests/extension/test_string.py | 6 ++++++ pandas/tests/frame/indexing/test_coercion.py | 3 +++ pandas/tests/frame/indexing/test_indexing.py | 7 +++++++ pandas/tests/frame/indexing/test_insert.py | 3 +++ pandas/tests/frame/indexing/test_setitem.py | 8 ++++++++ pandas/tests/frame/indexing/test_where.py | 5 +++++ pandas/tests/frame/indexing/test_xs.py | 3 +++ pandas/tests/frame/methods/test_combine_first.py | 3 +++ pandas/tests/frame/methods/test_convert_dtypes.py | 4 ++++ pandas/tests/frame/methods/test_cov_corr.py | 3 +++ pandas/tests/frame/methods/test_dropna.py | 3 +++ pandas/tests/frame/methods/test_dtypes.py | 3 +++ pandas/tests/frame/methods/test_fillna.py | 2 ++ pandas/tests/frame/methods/test_info.py | 5 +++++ pandas/tests/frame/methods/test_quantile.py | 8 ++++++++ pandas/tests/frame/methods/test_replace.py | 3 +++ pandas/tests/frame/methods/test_reset_index.py | 3 +++ pandas/tests/frame/methods/test_to_csv.py | 7 +++++++ .../tests/frame/methods/test_to_dict_of_blocks.py | 3 +++ pandas/tests/frame/test_arithmetic.py | 3 +++ pandas/tests/frame/test_arrow_interface.py | 4 ++++ pandas/tests/frame/test_block_internals.py | 5 +++++ pandas/tests/frame/test_constructors.py | 1 + pandas/tests/frame/test_query_eval.py | 3 +++ pandas/tests/frame/test_reductions.py | 4 ++++ pandas/tests/frame/test_stack_unstack.py | 4 ++++ pandas/tests/frame/test_unary.py | 3 +++ pandas/tests/groupby/aggregate/test_aggregate.py | 4 ++++ 
pandas/tests/groupby/aggregate/test_cython.py | 4 ++++ pandas/tests/groupby/aggregate/test_other.py | 3 +++ pandas/tests/groupby/methods/test_describe.py | 4 ++++ pandas/tests/groupby/methods/test_nth.py | 3 +++ pandas/tests/groupby/methods/test_quantile.py | 4 ++++ pandas/tests/groupby/methods/test_size.py | 3 +++ pandas/tests/groupby/methods/test_value_counts.py | 4 ++++ pandas/tests/groupby/test_categorical.py | 3 +++ pandas/tests/groupby/test_groupby.py | 6 ++++++ pandas/tests/groupby/test_groupby_dropna.py | 3 +++ pandas/tests/groupby/test_grouping.py | 4 ++++ pandas/tests/groupby/test_pipe.py | 4 ++++ pandas/tests/groupby/test_raises.py | 4 ++++ pandas/tests/groupby/test_reductions.py | 3 +++ pandas/tests/groupby/test_timegrouper.py | 3 +++ pandas/tests/groupby/transform/test_transform.py | 6 ++++++ pandas/tests/indexes/base_class/test_setops.py | 3 +++ pandas/tests/indexes/test_old_base.py | 1 + pandas/tests/indexing/test_iloc.py | 3 +++ pandas/tests/indexing/test_indexing.py | 3 +++ pandas/tests/indexing/test_loc.py | 1 + pandas/tests/interchange/test_impl.py | 4 ++++ pandas/tests/io/excel/test_readers.py | 1 + pandas/tests/io/excel/test_writers.py | 3 +++ pandas/tests/io/formats/style/test_to_latex.py | 3 +++ pandas/tests/io/json/test_pandas.py | 9 +++++++++ pandas/tests/io/parser/common/test_chunksize.py | 3 +++ pandas/tests/io/parser/common/test_common_basic.py | 3 +++ .../tests/io/parser/common/test_file_buffer_url.py | 3 +++ pandas/tests/io/parser/common/test_index.py | 3 +++ pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 7 +++++++ pandas/tests/io/parser/test_c_parser_only.py | 3 +++ pandas/tests/io/parser/test_converters.py | 3 +++ pandas/tests/io/parser/test_mangle_dupes.py | 3 +++ pandas/tests/io/parser/test_na_values.py | 6 ++++++ pandas/tests/io/parser/test_parse_dates.py | 5 +++++ pandas/tests/io/parser/test_python_parser_only.py | 4 ++++ pandas/tests/io/parser/test_read_fwf.py | 3 +++ pandas/tests/io/parser/test_upcast.py | 3 +++ pandas/tests/io/pytables/test_append.py | 7 ++++++- pandas/tests/io/pytables/test_categorical.py | 7 ++++++- pandas/tests/io/pytables/test_complex.py | 6 ++++++ pandas/tests/io/pytables/test_errors.py | 7 ++++++- pandas/tests/io/pytables/test_file_handling.py | 7 ++++++- pandas/tests/io/pytables/test_put.py | 7 ++++++- pandas/tests/io/pytables/test_read.py | 7 ++++++- pandas/tests/io/pytables/test_round_trip.py | 7 ++++++- pandas/tests/io/pytables/test_select.py | 7 ++++++- pandas/tests/io/pytables/test_store.py | 7 ++++++- pandas/tests/io/pytables/test_timezones.py | 6 ++++++ pandas/tests/io/sas/test_sas7bdat.py | 6 ++++++ pandas/tests/io/test_clipboard.py | 6 ++++++ pandas/tests/io/test_common.py | 7 +++++++ pandas/tests/io/test_compression.py | 3 +++ pandas/tests/io/test_feather.py | 4 ++++ pandas/tests/io/test_fsspec.py | 4 ++++ pandas/tests/io/test_gcs.py | 3 +++ pandas/tests/io/test_html.py | 6 ++++++ pandas/tests/io/test_http_headers.py | 3 +++ pandas/tests/io/test_orc.py | 11 ++++++++--- pandas/tests/io/test_parquet.py | 6 +++++- pandas/tests/io/test_sql.py | 11 ++++++++--- pandas/tests/io/test_stata.py | 13 +++++++++++++ pandas/tests/io/xml/test_xml.py | 3 +++ pandas/tests/io/xml/test_xml_dtypes.py | 4 ++++ pandas/tests/reductions/test_reductions.py | 5 +++++ pandas/tests/resample/test_resampler_grouper.py | 3 +++ pandas/tests/reshape/concat/test_concat.py | 3 +++ pandas/tests/reshape/merge/test_merge_asof.py | 3 +++ pandas/tests/reshape/test_from_dummies.py | 3 +++ pandas/tests/reshape/test_melt.py | 10 ++++++++++ 
pandas/tests/reshape/test_pivot.py | 2 ++ pandas/tests/reshape/test_union_categoricals.py | 3 +++ pandas/tests/series/accessors/test_dt_accessor.py | 6 ++++++ pandas/tests/series/indexing/test_indexing.py | 3 +++ pandas/tests/series/indexing/test_setitem.py | 6 ++++++ pandas/tests/series/methods/test_info.py | 3 +++ pandas/tests/series/methods/test_replace.py | 1 + pandas/tests/series/methods/test_to_csv.py | 3 +++ pandas/tests/series/methods/test_unstack.py | 3 +++ pandas/tests/series/test_arithmetic.py | 3 +++ pandas/tests/series/test_logical_ops.py | 3 +++ pandas/tests/test_algos.py | 4 ++++ 131 files changed, 539 insertions(+), 23 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 7b6f4e152f3a3..3ba61e39316af 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -87,7 +87,6 @@ jobs: pandas_copy_on_write: "warn" - name: "Future infer strings" env_file: actions-311.yaml - pattern: "not slow and not network and not single_cpu" pandas_future_infer_string: "1" - name: "Pypy" env_file: actions-pypy-39.yaml diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 9b48778c41804..39ab0890a32d1 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -16,11 +16,5 @@ if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" fi -# temporarily let pytest always succeed (many tests are not yet passing in the -# build enabling the future string dtype) -if [[ "$PANDAS_FUTURE_INFER_STRING" == "1" ]]; then - PYTEST_CMD="$PYTEST_CMD || true" -fi - echo $PYTEST_CMD sh -c "$PYTEST_CMD" diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index b7eac6b8f0ea1..a774ae214e09a 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -61,6 +63,7 @@ def test_apply(float_frame, engine, request): assert result.index is float_frame.index +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("raw", [True, False]) def test_apply_args(float_frame, axis, raw, engine, request): @@ -1169,6 +1172,7 @@ def test_agg_with_name_as_column_name(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_agg_multiple_mixed(): # GH 20909 mdf = DataFrame( @@ -1286,6 +1290,7 @@ def test_agg_reduce(axis, float_frame): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_nuiscance_columns(): # GH 15015 df = DataFrame( @@ -1462,6 +1467,7 @@ def test_apply_datetime_tz_issue(engine, request): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})]) @pytest.mark.parametrize("method", ["min", "max", "sum"]) def test_mixed_column_raises(df, method, using_infer_string): diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 57b81711ddb48..aee9100702350 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -17,6 
+19,7 @@ def apply_axis(request): return request.param +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_numba_vs_python_noop(float_frame, apply_axis): func = lambda x: x result = float_frame.apply(func, engine="numba", axis=apply_axis) @@ -40,6 +43,7 @@ def test_numba_vs_python_string_index(): ) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_numba_vs_python_indexing(): frame = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 17e8322dc40e1..8956aed5e9ceb 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.common import is_number from pandas import ( @@ -86,6 +88,7 @@ def test_apply_np_transformer(float_frame, op, how): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( @@ -144,6 +147,7 @@ def test_agg_cython_table_series(series, func, expected): assert result == expected +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index c2c53fbc4637e..7c7a236ab83cd 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import PYPY from pandas import ( @@ -296,6 +298,7 @@ def test_nbytes(self): exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories assert cat.nbytes == exp + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_memory_usage(self): cat = Categorical([1, 2, 3]) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index a939ee5f6f53f..1d948b7495a43 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import PY311 from pandas import ( @@ -156,6 +158,7 @@ def test_reorder_categories_raises(self, new_categories): with pytest.raises(ValueError, match=msg): cat.reorder_categories(new_categories) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) old = cat.copy() diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 6813683cb5219..60d78a906b528 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -743,6 +743,7 @@ def test_interval(self): tm.assert_numpy_array_equal(cat.codes, expected_codes) tm.assert_index_equal(cat.categories, idx) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_categorical_extension_array_nullable(self, nulls_fixture): # GH: arr = pd.arrays.StringArray._from_sequence( diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index ba081bd01062a..768d3c1449fa4 100644 --- 
a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core.arrays import FloatingArray @@ -122,6 +124,7 @@ def test_arith_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): op = all_arithmetic_operators s = pd.Series(data) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index 8acd298f37a07..8aa8c2db940b4 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core import ops @@ -172,6 +174,7 @@ def test_numpy_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): op = all_arithmetic_operators s = pd.Series(data) diff --git a/pandas/tests/arrays/masked/test_function.py b/pandas/tests/arrays/masked/test_function.py index b259018cd6121..81338bca460a6 100644 --- a/pandas/tests/arrays/masked/test_function.py +++ b/pandas/tests/arrays/masked/test_function.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.common import is_integer_dtype import pandas as pd @@ -58,6 +60,7 @@ def test_tolist(data): tm.assert_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_numpy(): # GH#56991 diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index 9a3f83e0293f5..5d0efdc149004 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Series, @@ -157,6 +159,7 @@ def test_dataframe_array_ea_dtypes(using_copy_on_write): assert arr.flags.writeable is True +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dataframe_array_string_dtype(using_copy_on_write, using_array_manager): df = DataFrame({"a": ["a", "b"]}, dtype="string") arr = np.asarray(df) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index d462ce3d3187d..14fd8fb5f911e 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.pyarrow import pa_version_under12p0 import pandas.util._test_decorators as td @@ -96,6 +98,7 @@ def test_astype_numpy_to_ea(): assert np.shares_memory(get_array(ser), get_array(result)) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) @@ -113,6 +116,7 @@ def test_astype_string_and_object(using_copy_on_write, dtype, new_dtype): tm.assert_frame_equal(df, df_orig) 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) @@ -221,6 +225,7 @@ def test_astype_arrow_timestamp(using_copy_on_write): ) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_infer_objects(using_copy_on_write): ser = Series(["a", "b", "c"]) ser_orig = ser.copy() @@ -240,6 +245,7 @@ def test_convert_dtypes_infer_objects(using_copy_on_write): tm.assert_series_equal(ser, ser_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes(using_copy_on_write): df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]}) df_orig = df.copy() diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 1aa458a625028..866b1964a334f 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -283,6 +285,7 @@ def test_dataframe_from_dict_of_series_with_reindex(dtype): assert np.shares_memory(arr_before, arr_after) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("cons", [Series, Index]) @pytest.mark.parametrize( "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)] diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 56e4b186350f2..4ec1d023c8ba7 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Index, @@ -12,6 +14,7 @@ from pandas.tests.copy_view.util import get_array +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_frames(using_copy_on_write): df = DataFrame({"b": ["a"] * 3}) df2 = DataFrame({"a": ["a"] * 3}) @@ -36,6 +39,7 @@ def test_concat_frames(using_copy_on_write): tm.assert_frame_equal(df, df_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_frames_updating_input(using_copy_on_write): df = DataFrame({"b": ["a"] * 3}) df2 = DataFrame({"a": ["a"] * 3}) @@ -197,6 +201,7 @@ def test_concat_copy_keyword(using_copy_on_write, copy): assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "func", [ @@ -260,6 +265,7 @@ def test_merge_on_index(using_copy_on_write): tm.assert_frame_equal(df2, df2_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "func, how", [ @@ -313,6 +319,7 @@ def test_merge_copy_keyword(using_copy_on_write, copy): assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_join_on_key(using_copy_on_write): df_index = Index(["a", "b", "c"], name="key") @@ -346,6 +353,7 @@ def test_join_on_key(using_copy_on_write): tm.assert_frame_equal(df2, df2_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_join_multiple_dataframes_on_key(using_copy_on_write): df_index = Index(["a", "b", "c"], name="key") diff --git 
a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index a727331307d7e..6f7198520d22e 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -76,6 +78,7 @@ def test_switch_options(): @td.skip_array_manager_invalid_test +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [np.intp, np.int8]) @pytest.mark.parametrize( "locs, arr", diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index ddc5879a56d54..338b76cbf1e7a 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( NA, ArrowDtype, @@ -135,6 +137,7 @@ def test_interp_fill_functions_inplace( assert np.shares_memory(arr, get_array(df, "a")) is (dtype == "float64") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_interpolate_cleaned_fill_method(using_copy_on_write): # Check that "method is set to None" case works correctly df = DataFrame({"a": ["a", np.nan, "c"], "b": 1}) @@ -156,6 +159,7 @@ def test_interpolate_cleaned_fill_method(using_copy_on_write): tm.assert_frame_equal(df, df_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_interpolate_object_convert_no_op(using_copy_on_write): df = DataFrame({"a": ["a", "b", "c"], "b": 1}) arr_a = get_array(df, "a") diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 5d1eefccbb1e7..d870342ef9e29 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import SettingWithCopyWarning import pandas as pd @@ -950,6 +952,7 @@ def test_head_tail(method, using_copy_on_write, warn_copy_on_write): tm.assert_frame_equal(df, df_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_infer_objects(using_copy_on_write): df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"}) df_orig = df.copy() @@ -1177,6 +1180,7 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_writ assert np.shares_memory(get_array(obj, "a"), get_array(view, "a")) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("decimals", [-1, 0, 1]) def test_round(using_copy_on_write, warn_copy_on_write, decimals): df = DataFrame({"a": [1, 2], "b": "c"}) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 0beac439fbb58..32da870c6d2e3 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( Categorical, DataFrame, @@ -10,6 +12,7 @@ from pandas.tests.copy_view.util import get_array +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "replace_kwargs", [ @@ -63,6 +66,7 @@ def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write): assert np.shares_memory(arr, get_array(df, "a")) 
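
Some of the marks above also pass strict=False while others rely on the default. The pandas test suite's pytest configuration enables xfail_strict, so a marked test that unexpectedly passes is normally reported as an error; strict=False relaxes that for tests where only some parametrizations actually break under the new string dtype. A small self-contained illustration (the test itself is made up):

    import pytest


    # Only the "object" case fails here, so a strict xfail would flag the
    # passing "int64" case as an unexpected pass (XPASS) and error out;
    # strict=False downgrades it to a plain xpass in the test report.
    @pytest.mark.xfail(True, reason="demo of strict=False", strict=False)
    @pytest.mark.parametrize("dtype", ["object", "int64"])
    def test_only_some_params_fail(dtype):
        assert dtype == "int64"
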
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_regex_inplace(using_copy_on_write): df = DataFrame({"a": ["aaa", "bbb"]}) arr = get_array(df, "a") @@ -350,6 +354,7 @@ def test_replace_empty_list(using_copy_on_write): assert not df2._mgr._has_no_reference(0) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("value", ["d", None]) def test_replace_object_list_inplace(using_copy_on_write, value): df = DataFrame({"a": ["a", "b", "c"]}) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index de1ddce724a5b..e522d2666a2dc 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -5,6 +5,8 @@ import pytest import pytz +from pandas._config import using_string_dtype + from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas.core.dtypes.base import _registry as registry @@ -959,6 +961,7 @@ def test_same_categories_different_order(self): c2 = CategoricalDtype(["b", "a"], ordered=True) assert c1 is not c2 + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("ordered1", [True, False, None]) @pytest.mark.parametrize("ordered2", [True, False, None]) def test_categorical_equality(self, ordered1, ordered2): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index f3eec142c1968..5a72b2244d2bf 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -21,6 +21,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.api.types import is_string_dtype @@ -28,6 +30,10 @@ from pandas.core.arrays.string_ import StringDtype from pandas.tests.extension import base +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + def maybe_split_array(arr, chunked): if not chunked: diff --git a/pandas/tests/frame/indexing/test_coercion.py b/pandas/tests/frame/indexing/test_coercion.py index ba0d8613b6228..9d20821ae8bc6 100644 --- a/pandas/tests/frame/indexing/test_coercion.py +++ b/pandas/tests/frame/indexing/test_coercion.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -97,6 +99,7 @@ def test_6942(indexer_al): assert df.iloc[0, 0] == t2 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_26395(indexer_al): # .at case fixed by GH#45121 (best guess) df = DataFrame(index=["A", "B", "C"]) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 22d9c7f26a57c..09f359df37dd1 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import iNaT from pandas.errors import ( InvalidIndexError, @@ -180,6 +182,7 @@ def test_getitem_boolean(self, mixed_float_frame, mixed_int_frame, datetime_fram if bif[c].dtype != bifw[c].dtype: assert bif[c].dtype == df[c].dtype + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_getitem_boolean_casting(self, datetime_frame): # don't upcast if we don't need to df = datetime_frame.copy() @@ -515,6 +518,7 @@ def test_setitem_ambig(self, using_infer_string): else: assert dm[2].dtype == np.object_ 
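
The extension/test_string.py hunk above takes a different route: instead of decorating each test, it assigns the mark to the module-level name pytestmark, which pytest applies to everything collected from that file. Sketched in isolation (the test function is hypothetical):

    import pytest

    from pandas._config import using_string_dtype

    # Assigning to the special name pytestmark marks every test in this
    # module; it is equivalent to decorating each function individually.
    pytestmark = pytest.mark.xfail(
        using_string_dtype(), reason="TODO(infer_string)", strict=False
    )


    def test_covered_by_the_module_level_mark():
        assert True
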
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_None(self, float_frame, using_infer_string): # GH #766 float_frame[None] = float_frame["A"] @@ -1181,6 +1185,7 @@ def test_setitem_with_unaligned_tz_aware_datetime_column(self): df.loc[[0, 1, 2], "dates"] = column[[1, 0, 2]] tm.assert_series_equal(df["dates"], column) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_loc_setitem_datetimelike_with_inference(self): # GH 7592 # assignment of timedeltas with NaT @@ -1203,6 +1208,7 @@ def test_loc_setitem_datetimelike_with_inference(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_getitem_boolean_indexing_mixed(self): df = DataFrame( { @@ -1956,6 +1962,7 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: tm.assert_frame_equal(df, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_new_column_infer_string(): # GH#55366 pytest.importorskip("pyarrow") diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index 7e702bdc993bd..82b75459f08d0 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import PerformanceWarning from pandas import ( @@ -60,6 +62,7 @@ def test_insert_column_bug_4032(self): expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"]) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_insert_with_columns_dups(self): # GH#14291 df = DataFrame() diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index a58dd701f0f22..bce3cb5dacabe 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas.core.dtypes.base import _registry as ea_registry @@ -146,6 +148,7 @@ def test_setitem_different_dtype(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_empty_columns(self): # GH 13522 df = DataFrame(index=["A", "B", "C"]) @@ -161,6 +164,7 @@ def test_setitem_dt64_index_empty_columns(self): df["A"] = rng assert df["A"].dtype == np.dtype("M8[ns]") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_timestamp_empty_columns(self): # GH#19843 df = DataFrame(index=range(3)) @@ -200,6 +204,7 @@ def test_setitem_with_unaligned_sparse_value(self): expected = Series(SparseArray([1, 0, 0]), name="new_column") tm.assert_series_equal(df["new_column"], expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_period_preserves_dtype(self): # GH: 26861 data = [Period("2003-12", "D")] @@ -672,6 +677,7 @@ def test_setitem_iloc_two_dimensional_generator(self): expected = DataFrame({"a": [1, 2, 3], "b": [4, 1, 1]}) tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_dtypes_bytes_type_to_object(self): # GH 20734 index = Series(name="id", dtype="S24") @@ -706,6 +712,7 @@ def test_setitem_ea_dtype_rhs_series(self): # 
TODO(ArrayManager) set column with 2d column array, see #44788 @td.skip_array_manager_not_yet_implemented + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_npmatrix_2d(self): # GH#42376 # for use-case df["x"] = sparse.random((10, 10)).mean(axis=1) @@ -929,6 +936,7 @@ def test_setitem_with_expansion_categorical_dtype(self): ser.name = "E" tm.assert_series_equal(result2.sort_index(), ser.sort_index()) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_scalars_no_index(self): # GH#16823 / GH#17894 df = DataFrame() diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 3d36d0471f02f..dfbc3b4ca33ad 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.common import is_scalar import pandas as pd @@ -46,6 +48,7 @@ def is_ok(s): class TestDataFrameIndexingWhere: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_get(self, where_frame, float_string_frame): def _check_get(df, cond, check_dtypes=True): other1 = _safe_add(df) @@ -97,6 +100,7 @@ def test_where_upcasting(self): tm.assert_series_equal(result, expected) @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_alignment(self, where_frame, float_string_frame): # aligning def _check_align(df, cond, other, check_dtypes=True): @@ -172,6 +176,7 @@ def test_where_invalid(self): df.mask(0) @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_set(self, where_frame, float_string_frame, mixed_int_frame): # where inplace diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index be809e3a17c8e..4ca435fa5acc5 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import SettingWithCopyError from pandas import ( @@ -77,6 +79,7 @@ def test_xs( else: assert (expected == 5).all() + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_xs_corner(self): # pathological mixed-type reordering case df = DataFrame(index=[0]) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 8aeab5dacd8b4..a4ee0b08e1e66 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import is_dtype_equal @@ -30,6 +32,7 @@ def test_combine_first_mixed(self): combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_combine_first(self, float_frame, using_infer_string): # disjoint head, tail = float_frame[:5], float_frame[5:] diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 9cbbebf35b2d1..91fa81b5bee2e 100644 
--- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm @@ -181,6 +183,7 @@ def test_convert_dtypes_pyarrow_timestamp(self): result = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_avoid_block_splitting(self): # GH#55341 df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"}) @@ -195,6 +198,7 @@ def test_convert_dtypes_avoid_block_splitting(self): tm.assert_frame_equal(result, expected) assert result._mgr.nblocks == 2 + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_from_arrow(self): # GH#56581 df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"]) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 04a08c8b9bc52..721ec4e43eb1b 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -326,6 +328,7 @@ def test_corrwith(self, datetime_frame, dtype): for row in index[:4]: tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_corrwith_with_objects(self, using_infer_string): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py index 7899b4aeac3fd..87a43b4e67c3f 100644 --- a/pandas/tests/frame/methods/test_dropna.py +++ b/pandas/tests/frame/methods/test_dropna.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -182,6 +184,7 @@ def test_dropna_multiple_axes(self): with pytest.raises(TypeError, match="supplying multiple axes"): inp.dropna(how="all", axis=(0, 1), inplace=True) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dropna_tz_aware_datetime(self): # GH13407 df = DataFrame() diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py index ab632ac17318e..2556c44e63a77 100644 --- a/pandas/tests/frame/methods/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd @@ -142,6 +144,7 @@ def test_dtypes_timedeltas(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_apply_np_array_return_type(self, using_infer_string): # GH 35517 df = DataFrame([["foo"]]) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index e2baa2567f5b4..d767e35878b52 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -126,6 +126,7 @@ def test_fillna_empty(self, using_copy_on_write): df.x.fillna(method=m, inplace=True) df.x.fillna(method=m) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def 
test_fillna_different_dtype(self, using_infer_string): # with different dtype (GH#3386) df = DataFrame( @@ -370,6 +371,7 @@ def test_fillna_dictlike_value_duplicate_colnames(self, columns): expected["A"] = 0.0 tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_fillna_dtype_conversion(self, using_infer_string): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index c3d02a07f397e..475632667a87a 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import ( IS64, PYPY, @@ -433,6 +435,7 @@ def test_usage_via_getsizeof(): assert abs(diff) < 100 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_info_memory_usage_qualified(): buf = StringIO() df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) @@ -493,6 +496,7 @@ def test_info_categorical(): df.info(buf=buf) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system") def test_info_int_columns(): # GH#37245 @@ -516,6 +520,7 @@ def test_info_int_columns(): assert result == expected +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_memory_usage_empty_no_warning(): # GH#50066 df = DataFrame(index=["a", "b"]) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 0f27eae1a3bfc..ec070467b242e 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -352,6 +354,7 @@ def test_quantile_multi_empty(self, interp_method): ) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_datetime(self, unit): dti = pd.to_datetime(["2010", "2011"]).as_unit(unit) df = DataFrame({"a": dti, "b": [0, 5]}) @@ -405,6 +408,7 @@ def test_quantile_datetime(self, unit): expected = DataFrame(index=[0.5], columns=[]) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype", [ @@ -675,6 +679,7 @@ def test_quantile_nat(self, interp_method, request, using_array_manager, unit): ) tm.assert_frame_equal(res, exp) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_empty_no_rows_floats(self, interp_method): interpolation, method = interp_method @@ -913,6 +918,7 @@ def test_quantile_ea_scalar(self, request, obj, index): else: tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype, expected_data, expected_index, axis", [ @@ -931,6 +937,7 @@ def test_empty_numeric(self, dtype, expected_data, expected_index, axis): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype, expected_data, expected_index, axis, expected_dtype", [ @@ -949,6 +956,7 @@ def test_empty_datelike( ) tm.assert_series_equal(result, 
expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "expected_data, expected_index, axis", [ diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 0884c091ba96a..fd8039975a514 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -624,6 +624,7 @@ def test_replace_mixed_int_block_splitting(self): result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_mixed2(self, using_infer_string): # to object block upcasting df = DataFrame( @@ -1443,6 +1444,7 @@ def test_replace_ea_ignore_float(self, frame_or_series, value): result = obj.replace(1.0, 0.0) tm.assert_equal(expected, result) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_value_category_type(self): """ Test for #23305: to ensure category dtypes are maintained @@ -1538,6 +1540,7 @@ def test_replace_with_compiled_regex(self): expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_intervals(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index fbf36dbc4fb02..44d7bbf57fe0a 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -644,6 +646,7 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes): tm.assert_frame_equal(res, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "array, dtype", [ diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 250567eafc670..bed8b030bc72a 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ParserError import pandas as pd @@ -420,6 +422,7 @@ def test_to_csv_empty(self): result, expected = self._return_result_expected(df, 1000) tm.assert_frame_equal(result, expected, check_column_type=False) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.slow def test_to_csv_chunksize(self): chunksize = 1000 @@ -432,6 +435,7 @@ def test_to_csv_chunksize(self): result, expected = self._return_result_expected(df, chunksize, rnlvl=2) tm.assert_frame_equal(result, expected, check_names=False) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.slow @pytest.mark.parametrize( "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] @@ -528,6 +532,7 @@ def test_to_csv_headers(self): assert return_value is None tm.assert_frame_equal(to_df, recons) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_multiindex(self, float_frame, datetime_frame): frame = float_frame old_index = frame.index @@ -721,6 +726,7 @@ def 
test_to_csv_withcommas(self): df2 = self.read_csv(path) tm.assert_frame_equal(df2, df) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_mixed(self): def create_cols(name): return [f"{name}{i:03d}" for i in range(5)] @@ -810,6 +816,7 @@ def test_to_csv_dups_cols(self): result.columns = df.columns tm.assert_frame_equal(result, df) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_dups_cols2(self): # GH3457 df = DataFrame( diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index f64cfd5fe6a2d..42858aa412810 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -35,6 +37,7 @@ def test_no_copy_blocks(self, float_frame, using_copy_on_write): assert _last_df is not None and not _last_df[column].equals(df[column]) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_dict_of_blocks_item_cache(using_copy_on_write, warn_copy_on_write): # Calling to_dict_of_blocks should not poison item_cache df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index f68785a354d7e..4e32dc0cb3f98 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,6 +11,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -1560,6 +1562,7 @@ def test_comparisons(self, simple_frame, float_frame, func): with pytest.raises(ValueError, match=msg): func(simple_frame, simple_frame[:2]) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne): # GH 11565 df = DataFrame( diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py index 098d1829b973c..dc163268f64b9 100644 --- a/pandas/tests/frame/test_arrow_interface.py +++ b/pandas/tests/frame/test_arrow_interface.py @@ -2,6 +2,8 @@ import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -9,6 +11,7 @@ pa = pytest.importorskip("pyarrow") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @td.skip_if_no("pyarrow", min_version="14.0") def test_dataframe_arrow_interface(): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) @@ -31,6 +34,7 @@ def test_dataframe_arrow_interface(): assert table.equals(expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @td.skip_if_no("pyarrow", min_version="15.0") def test_dataframe_to_arrow(): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 712494ef15f97..9bd61736624ca 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import PerformanceWarning import pandas.util._test_decorators as td @@ -183,6 +185,7 @@ def test_constructor_with_convert(self): ) 
tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_construction_with_mixed(self, float_string_frame, using_infer_string): # test construction edge cases with mixed types @@ -214,6 +217,7 @@ def test_construction_with_mixed(self, float_string_frame, using_infer_string): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_construction_with_conversions(self): # convert from a numpy array of non-ns timedelta64; as of 2.0 this does # *not* convert @@ -434,6 +438,7 @@ def test_update_inplace_sets_valid_block_values(using_copy_on_write): assert df.isnull().sum().sum() == 0 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_nonconsolidated_item_cache_take(): # https://github.com/pandas-dev/pandas/issues/35521 diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index b45eca127b3e4..aab900f6eef47 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1963,6 +1963,7 @@ def test_constructor_with_datetimes4(self): df = DataFrame({"value": dr}) assert str(df.iat[0, 0].tz) == "US/Eastern" + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_constructor_with_datetimes5(self): # GH 7822 # preserver an index with a tz on dict construction diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 2c807c72582c5..13232a0909c5b 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ( NumExprClobberingError, UndefinedVariableError, @@ -726,6 +728,7 @@ def test_inf(self, op, f, engine, parser): result = df.query(q, engine=engine, parser=parser) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_check_tz_aware_index_query(self, tz_aware_fixture): # https://github.com/pandas-dev/pandas/issues/29463 tz = tz_aware_fixture diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 45d06c56d353f..db15461ba0234 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -608,6 +608,7 @@ def test_sem(self, datetime_frame): result = nanops.nansem(arr, axis=0) assert not (result < 0).any() + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dropna, expected", [ @@ -1068,6 +1069,7 @@ def test_sum_bools(self): # ---------------------------------------------------------------------- # Index of max / min + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmin(self, float_frame, int_frame, skipna, axis): @@ -1117,6 +1119,7 @@ def test_idxmin_axis_2(self, float_frame): with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmax(self, float_frame, int_frame, skipna, axis): @@ -1359,6 +1362,7 @@ def test_any_all_extra(self): result = df[["C"]].all(axis=None).item() assert result is True 
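
To watch these marks fire locally, the string-inference option has to be switched on before the test modules are imported, because the mark conditions are frozen at collection. One way to check the behavior interactively (a sketch using the public option API, not part of the patch):

    import pandas as pd

    from pandas._config import using_string_dtype

    # Opt in to the future string-inference behavior; using_string_dtype()
    # simply reports the state of this option.
    pd.set_option("future.infer_string", True)
    assert using_string_dtype()

    # String data now infers as a dedicated string dtype instead of object,
    # which is exactly what the TODO(infer_string) tests do not yet expect.
    ser = pd.Series(["a", "b"])
    print(ser.dtype)  # a pandas string dtype rather than object
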
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize("skipna", [True, False]) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index d8b92091260a3..75ef348b75deb 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib from pandas.errors import PerformanceWarning @@ -1646,6 +1648,7 @@ def test_unstack_multiple_no_empty_columns(self): expected = unstacked.dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) @@ -1889,6 +1892,7 @@ def test_stack_level_name(self, multiindex_dataframe_random_data, future_stack): expected = frame.stack(future_stack=future_stack) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index 850c92013694f..6f7453d0d1655 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.numpy import np_version_gte1p25 import pandas as pd @@ -126,6 +128,7 @@ def test_pos_object(self, df): tm.assert_frame_equal(+df, df) tm.assert_series_equal(+df["a"], df["a"]) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "df", [ diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 6223a153df358..b267347aaf030 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import SpecificationError from pandas.core.dtypes.common import is_integer_dtype @@ -333,6 +335,7 @@ def aggfun_1(ser): assert len(result) == 0 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wrap_agg_out(three_group): grouped = three_group.groupby(["A", "B"]) @@ -1098,6 +1101,7 @@ def test_lambda_named_agg(func): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_aggregate_mixed_types(): # GH 16916 df = DataFrame( diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 5c99882cef6d2..fbbace54a3444 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -93,6 +95,7 @@ def test_cython_agg_boolean(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_cython_agg_nothing_to_agg(): frame = DataFrame( {"a": np.random.default_rng(2).integers(0, 5, 50), "b": 
["foo", "bar"] * 25} @@ -161,6 +164,7 @@ def test_cython_agg_return_dict(): tm.assert_series_equal(ts, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_cython_fail_agg(): dr = bdate_range("1/1/2000", periods=50) ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 00136e572288e..35ee6c388d5a8 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import SpecificationError import pandas as pd @@ -306,6 +308,7 @@ def test_series_agg_multikey(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_series_agg_multi_pure_python(): data = DataFrame( { diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py index a2440e09dfc02..34b046bff7c91 100644 --- a/pandas/tests/groupby/methods/test_describe.py +++ b/pandas/tests/groupby/methods/test_describe.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -71,6 +73,7 @@ def test_series_describe_as_index(as_index, keys): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_describe_multikey(tsframe): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() @@ -272,6 +275,7 @@ def test_describe(self, df, gb, gni): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [int, float, object]) @pytest.mark.parametrize( "kwargs", diff --git a/pandas/tests/groupby/methods/test_nth.py b/pandas/tests/groupby/methods/test_nth.py index a8ed9e9d52021..344258257ba80 100644 --- a/pandas/tests/groupby/methods/test_nth.py +++ b/pandas/tests/groupby/methods/test_nth.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -704,6 +706,7 @@ def test_first_multi_key_groupby_categorical(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("method", ["first", "last", "nth"]) def test_groupby_last_first_nth_with_none(method, nulls_fixture): # GH29645 diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index 361a8c27fbf9d..d3bc815402ade 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -168,6 +170,7 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) @@ -250,6 +253,7 @@ def test_groupby_quantile_nullable_array(values, q): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), 
reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index 93a4e743d0d71..5b4c08fc24411 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype @@ -108,6 +110,7 @@ def test_size_series_masked_type_returns_Int64(dtype): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype", [ diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 8e25177368d8b..dcc0b39f0006c 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -285,6 +287,7 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("groupby", ["column", "array", "function"]) @pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) @pytest.mark.parametrize( @@ -372,6 +375,7 @@ def test_against_frame_and_seriesgroupby( tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype", [ diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index f60ff65536f20..1073dda954563 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -338,6 +340,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_observed(observed): # multiple groupers, don't re-expand the output space # of the grouper diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 44d6340e55507..4381b36b0b73a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ( PerformanceWarning, SpecificationError, @@ -1597,6 +1599,7 @@ def test_groupby_two_group_keys_all_nan(): assert result == {} +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_2d_malformed(): d = DataFrame(index=range(2)) d["group"] = ["g1", "g2"] @@ -2691,6 +2694,7 @@ def test_groupby_all_nan_groups_drop(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_empty_multi_column(as_index, 
numeric_only): # GH 15106 & GH 41998 @@ -2707,6 +2711,7 @@ def test_groupby_empty_multi_column(as_index, numeric_only): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_aggregation_non_numeric_dtype(): # GH #43108 df = DataFrame( @@ -2864,6 +2869,7 @@ def test_groupby_none_in_first_mi_level(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_none_column_name(): # GH#47348 df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]}) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 9155f2cccf117..9c01e017dd29c 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.missing import na_value_for_dtype @@ -97,6 +99,7 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( tm.assert_frame_equal(grouped, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dropna, idx, outputs", [ diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index d763b67059375..6e3ae2f7d8fae 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( CategoricalIndex, @@ -842,6 +844,7 @@ def test_groupby_empty(self): expected = ["name"] assert result == expected + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_level_index_value_all_na(self): # issue 20519 df = DataFrame( @@ -986,6 +989,7 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_with_single_column(self): df = DataFrame({"a": list("abssbab")}) tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]]) diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py index 7d5c1625b8ab4..1044c83e3e56b 100644 --- a/pandas/tests/groupby/test_pipe.py +++ b/pandas/tests/groupby/test_pipe.py @@ -1,4 +1,7 @@ import numpy as np +import pytest + +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -8,6 +11,7 @@ import pandas._testing as tm +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_pipe(): # Test the pipe method of DataFrameGroupBy. 
# Issue #17871 diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 0b451ce73db89..64780d0ba03d8 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( Categorical, DataFrame, @@ -117,6 +119,7 @@ def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): gb.transform(groupby_func, *args) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_string( how, by, groupby_series, groupby_func, df_with_string_col @@ -216,6 +219,7 @@ def func(x): getattr(gb, how)(func) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("how", ["agg", "transform"]) @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) def test_groupby_raises_string_np( diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 25b0f80639cff..f67051de6e8c7 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import iNaT from pandas.core.dtypes.common import pandas_dtype @@ -455,6 +457,7 @@ def test_max_min_non_numeric(): assert "ss" in result +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_max_min_object_multiple_columns(using_array_manager): # GH#41111 case where the aggregation is valid for some columns but not # others; we split object blocks column-wise, consistent with diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 8ef7c2b8ce859..69542b934e65f 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -10,6 +10,8 @@ import pytest import pytz +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -73,6 +75,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): class TestGroupBy: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_with_timegrouper(self): # GH 4161 # TimeGrouper requires a sorted index diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index fd9bd5cc55538..a5433d5496b0b 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -2,6 +2,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib from pandas.core.dtypes.common import ensure_platform_int @@ -497,6 +499,7 @@ def test_transform_select_columns(df): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_transform_nuisance_raises(df): # case that goes through _transform_item_by_item @@ -579,6 +582,7 @@ def test_transform_coercion(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_transform_with_int(): # GH 3740, make sure that we might upcast on item-by-item transform @@ -846,6 +850,7 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): tm.assert_frame_equal(result, expected) 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.slow @pytest.mark.parametrize( "op, args, targop", @@ -1219,6 +1224,7 @@ def test_groupby_transform_with_datetimes(func, values): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_transform_dtype(): # GH 22243 df = DataFrame({"a": [1], "val": [1.35]}) diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 3ef3f3ad4d3a2..2176aa52b17f4 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( Index, @@ -231,6 +233,7 @@ def test_tuple_union_bug(self, method, expected, sort): expected = Index(expected) tm.assert_index_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("first_list", [["b", "a"], []]) @pytest.mark.parametrize("second_list", [["a", "b"], []]) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index bfbb0c3dda5c5..0e6722ff50d1e 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -846,6 +846,7 @@ def test_append_preserves_dtype(self, simple_index): alt = index.take(list(range(N)) * 2) tm.assert_index_equal(result, alt, check_exact=True) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_inv(self, simple_index, using_infer_string): idx = simple_index diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 43dd3812e8b7d..3fd9498e21a73 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import IndexingError import pandas.util._test_decorators as td @@ -1216,6 +1218,7 @@ def test_iloc_getitem_int_single_ea_block_view(self): arr[2] = arr[-1] assert ser[0] == arr[-1] + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_iloc_setitem_multicolumn_to_datetime(self): # GH#20511 df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]}) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 162c2c58ff744..d2c8454019a5e 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import IndexingError from pandas.core.dtypes.common import ( @@ -561,6 +563,7 @@ def test_string_slice_empty(self): with pytest.raises(KeyError, match="^0$"): df.loc["2011", 0] + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_astype_assignment(self, using_infer_string): # GH4312 (iloc) df_orig = DataFrame( diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index d33719f3e2115..41431c0e2813b 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -642,6 +642,7 @@ def test_loc_setitem_consistency_empty(self): expected["x"] = expected["x"].astype(np.int64) tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(using_string_dtype(), 
reason="TODO(infer_string)") def test_loc_setitem_consistency_slice_column_len(self): # .loc[:,column] setting with slice == len of the column # GH10408 diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 25418b8bb2b37..d1a15dc93f702 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import iNaT from pandas.compat import ( is_ci_environment, @@ -418,6 +420,7 @@ def test_empty_string_column(): tm.assert_frame_equal(df, result) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_large_string(): # GH#56702 pytest.importorskip("pyarrow") @@ -434,6 +437,7 @@ def test_non_str_names(): assert names == ["0"] +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_non_str_names_w_duplicates(): # https://github.com/pandas-dev/pandas/issues/56701 df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]}) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 8d36dc7520019..a83176cfe28f8 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -595,6 +595,7 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): actual = pd.read_excel(basename + read_ext, dtype=dtype) tm.assert_frame_equal(actual, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend(self, read_ext, dtype_backend, engine): # GH#36712 if read_ext in (".xlsb", ".xls"): diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 292eab2d88152..7ecddb18a61ec 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -11,6 +11,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import is_platform_windows from pandas.compat._constants import PY310 from pandas.compat._optional import import_optional_dependency @@ -1311,6 +1313,7 @@ def test_freeze_panes(self, path): result = pd.read_excel(path, index_col=0) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_path_path_lib(self, engine, ext): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), diff --git a/pandas/tests/io/formats/style/test_to_latex.py b/pandas/tests/io/formats/style/test_to_latex.py index 7f1443c3ee66b..b29c880d1f823 100644 --- a/pandas/tests/io/formats/style/test_to_latex.py +++ b/pandas/tests/io/formats/style/test_to_latex.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, MultiIndex, @@ -729,6 +731,7 @@ def test_longtable_caption_label(styler, caption, cap_exp, label, lab_exp): ) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize( "columns, siunitx", diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 2157498aea95e..99c45c61fc8a4 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -208,6 +208,7 @@ def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame): assert_json_roundtrip_equal(result, expected, orient) + @pytest.mark.xfail(using_string_dtype(), 
reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [False, np.int64]) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame): @@ -285,6 +286,7 @@ def test_roundtrip_empty(self, orient, convert_axes): tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_timestamp(self, orient, convert_axes, datetime_frame): # TODO: improve coverage with date_format parameter @@ -712,6 +714,7 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("dtype", [False, None]) def test_series_roundtrip_object(self, orient, dtype, object_series): data = StringIO(object_series.to_json(orient=orient)) @@ -816,6 +819,7 @@ def test_path(self, float_frame, int_frame, datetime_frame): df.to_json(path) read_json(path) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_axis_dates(self, datetime_series, datetime_frame): # frame json = StringIO(datetime_frame.to_json()) @@ -828,6 +832,7 @@ def test_axis_dates(self, datetime_series, datetime_frame): tm.assert_series_equal(result, datetime_series, check_names=False) assert result.name is None + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dates(self, datetime_series, datetime_frame): # frame df = datetime_frame @@ -898,6 +903,7 @@ def test_convert_dates_infer(self, infer_word): result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]] tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "date,date_unit", [ @@ -958,6 +964,7 @@ def test_date_format_series_raises(self, datetime_series): with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_date_unit(self, unit, datetime_frame): df = datetime_frame @@ -1063,6 +1070,7 @@ def test_round_trip_exception(self, datapath): res = res.fillna(np.nan, downcast=False) tm.assert_frame_equal(res, df) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.network @pytest.mark.single_cpu @pytest.mark.parametrize( @@ -2027,6 +2035,7 @@ def test_json_uint64(self): result = df.to_json(orient="split") assert result == expected + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "orient", ["split", "records", "values", "index", "columns"] ) diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 9f42cf674b0a7..7b70601addcad 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import parsers as libparsers from pandas.errors import DtypeWarning @@ -228,6 +230,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): assert result.a.dtype == float +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def 
test_warn_if_chunks_have_mismatched_type(all_parsers): warning_type = None parser = all_parsers diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 7ffc49e941c14..95a7664304889 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -12,6 +12,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ( EmptyDataError, ParserError, @@ -915,6 +917,7 @@ def test_dict_keys_as_names(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0 def test_encoding_surrogatepass(all_parsers): # GH39017 diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index a7a8d031da215..1f5021c8a24cc 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -14,6 +14,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ( EmptyDataError, ParserError, @@ -67,6 +69,7 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.index are different def test_path_path_lib(all_parsers): parser = all_parsers diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 038c684c90c9e..0121af53f1aa4 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -8,6 +8,8 @@ import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Index, @@ -85,6 +87,7 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_no_level_names(all_parsers, index_col): data = """index1,index2,A,B,C,D diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index ce02e752fb90b..9c1ae6f9b4236 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ParserWarning import pandas as pd @@ -54,6 +56,7 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_per_column(all_parsers): parser = all_parsers @@ -301,6 +304,7 @@ def test_true_values_cast_to_bool(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): @@ -316,6 +320,7 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): tm.assert_frame_equal(result, expected) 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_mangle_dup_cols_single_dtype(all_parsers): # GH#42022 @@ -458,6 +463,7 @@ def test_dtype_backend_and_dtype(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dtype_backend_string(all_parsers, string_storage): # GH#36712 pa = pytest.importorskip("pyarrow") @@ -501,6 +507,7 @@ def test_dtype_backend_ea_dtype_specified(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend_pyarrow(all_parsers, request): # GH#36712 pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 27d7bc0bb6c07..1501479510e17 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -17,6 +17,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import ( ParserError, @@ -183,6 +185,7 @@ def error(val: float, actual_val: Decimal) -> Decimal: assert max(precise_errors) <= max(normal_errors) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_usecols_dtypes(c_parser_only): parser = c_parser_only data = """\ diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 7f3e45324dbd2..a3c6dc8fd0898 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -184,6 +186,7 @@ def convert_score(x): tm.assert_frame_equal(results[0], results[1]) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("conv_f", [lambda x: x, str]) def test_converter_index_col_bug(all_parsers, conv_f): # see gh-1835 , GH#40589 diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 1d245f81f027c..32a8d3b81f470 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -7,6 +7,8 @@ import pytest +from pandas._config import using_string_dtype + from pandas import DataFrame import pandas._testing as tm @@ -118,6 +120,7 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.columns are different def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index ca106fa772e82..1a3b7b37bf66b 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.parsers import STR_NA_VALUES from pandas import ( @@ -258,6 +260,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "kwargs,expected", [ 
@@ -432,6 +435,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case @pytest.mark.parametrize( "na_filter,row_data", @@ -626,6 +630,7 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # mismatched shape @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): @@ -677,6 +682,7 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): # TODO: this test isn't about the na_values keyword, it is about the empty entries # being returned with NaN entries, whereas the pyarrow engine returns "nan" @xfail_pyarrow # mismatched shapes +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 623657b412682..be2015fca27d1 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -16,6 +16,8 @@ import pytest import pytz +from pandas._config import using_string_dtype + from pandas._libs.tslibs import parsing import pandas as pd @@ -1797,6 +1799,7 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @skip_pyarrow # pandas.errors.ParserError: CSV parse error @pytest.mark.parametrize( "date_string", @@ -2051,6 +2054,7 @@ def test_parse_dates_and_keep_original_column(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dayfirst_warnings(): # GH 12585 @@ -2200,6 +2204,7 @@ def test_parse_dates_and_string_dtype(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_parse_dot_separated_dates(all_parsers): # https://github.com/pandas-dev/pandas/issues/2586 parser = all_parsers diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index dbd474c6ae0b9..9e7530906afa3 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -17,6 +17,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ( ParserError, ParserWarning, @@ -496,6 +498,7 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}] ) @@ -523,6 +526,7 @@ def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, d tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype,expected", [ diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index bed2b5e10a6f7..53426bebaa70b 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ 
b/pandas/tests/io/parser/test_read_fwf.py @@ -14,6 +14,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import EmptyDataError import pandas as pd @@ -966,6 +968,7 @@ def test_widths_and_usecols(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend(string_storage, dtype_backend): # GH#50289 if string_storage == "python": diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index bc4c4c2e24e9c..d8c40670afcbd 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.parsers import ( _maybe_upcast, na_values, @@ -84,6 +86,7 @@ def test_maybe_upcaste_all_nan(): tm.assert_extension_array_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("val", [na_values[np.object_], "c"]) def test_maybe_upcast_object(val, string_storage): # GH#36712 diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 00a81a4f1f385..93e50455fe6a2 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import Timestamp import pandas.util._test_decorators as td @@ -23,7 +25,10 @@ ensure_clean_store, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] tables = pytest.importorskip("tables") diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py index 58ebdfe7696b4..07c797467e5e2 100644 --- a/pandas/tests/io/pytables/test_categorical.py +++ b/pandas/tests/io/pytables/test_categorical.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( Categorical, DataFrame, @@ -14,7 +16,10 @@ ensure_clean_store, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_categorical(setup_path): diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index c5cac5a5caf09..d140cfc941e16 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -11,6 +13,10 @@ from pandas.io.pytables import read_hdf +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + def test_complex_fixed(tmp_path, setup_path): df = DataFrame( diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py index 2021101098892..c31b9989ef35e 100644 --- a/pandas/tests/io/pytables/test_errors.py +++ b/pandas/tests/io/pytables/test_errors.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( CategoricalIndex, DataFrame, @@ -22,7 +24,10 @@ _maybe_adjust_name, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + 
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_pass_spec_to_storer(setup_path): diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index d93de16816725..9daf2a5910a08 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import ( PY311, is_ci_environment, @@ -32,7 +34,10 @@ from pandas.io import pytables from pandas.io.pytables import Term -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"]) diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index bc5f046b7fa33..f84a3ebfeb54a 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import Timestamp import pandas as pd @@ -22,7 +24,10 @@ ) from pandas.util import _test_decorators as td -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_format_type(tmp_path, setup_path): diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index e4a3ea1fc9db8..a04f02f0e052b 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import Timestamp from pandas.compat import is_platform_windows @@ -26,7 +28,10 @@ from pandas.io.pytables import TableIterator -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_read_missing_key_close_store(tmp_path, setup_path): diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 4ba9787a5a6b9..2397d18b1019e 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import Timestamp from pandas.compat import is_platform_windows @@ -24,7 +26,10 @@ ) from pandas.util import _test_decorators as td -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_conv_read_write(): diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index 0e303d1c890c5..9f403f8293aed 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import Timestamp import pandas as pd @@ -24,7 +26,10 @@ from pandas.io.pytables import Term -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def 
test_select_columns_in_where(setup_path): diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 82d3052e7f5d6..8a33cccf62fcf 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -31,7 +33,10 @@ read_hdf, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] tables = pytest.importorskip("tables") diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index c5613daf62207..05d630dc0e47c 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs.timezones import maybe_get_tz import pandas.util._test_decorators as td @@ -23,6 +25,10 @@ ensure_clean_store, ) +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + def _compare_with_tz(a, b): tm.assert_frame_equal(a, b) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index b71896c77ffb5..493971f9f56ef 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import IS64 from pandas.errors import EmptyDataError import pandas.util._test_decorators as td @@ -16,6 +18,10 @@ from pandas.io.sas.sas7bdat import SAS7BDATReader +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + @pytest.fixture def dirpath(datapath): diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 3c0208fcc74ec..da998f058471c 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ( PyperclipException, PyperclipWindowsException, @@ -28,6 +30,10 @@ init_qt_clipboard, ) +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + def build_kwargs(sep, excel): kwargs = {} diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index e51f86563081b..56707560c2fda 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -18,6 +18,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import is_platform_windows import pandas.util._test_decorators as td @@ -152,6 +154,7 @@ def test_bytesiowrapper_returns_correct_bytes(self): assert result == data.encode("utf-8") # Test that pyarrow can handle a file opened with get_handle + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_get_handle_pyarrow_compat(self): pa_csv = pytest.importorskip("pyarrow.csv") @@ -347,6 +350,7 @@ def test_read_fspath_all(self, reader, module, path, datapath): ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"), ], ) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_write_fspath_all(self, writer_name, writer_kwargs, module): if writer_name in 
["to_latex"]: # uses Styler implementation pytest.importorskip("jinja2") @@ -450,6 +454,7 @@ def test_unknown_engine(self): with pytest.raises(ValueError, match="Unknown engine"): pd.read_csv(path, engine="pyt") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_binary_mode(self): """ 'encoding' shouldn't be passed to 'open' in binary mode. @@ -508,6 +513,7 @@ def test_is_fsspec_url(): assert icom.is_fsspec_url("RFC-3986+compliant.spec://something") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("encoding", [None, "utf-8"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_codecs_encoding(encoding, format): @@ -528,6 +534,7 @@ def test_codecs_encoding(encoding, format): tm.assert_frame_equal(expected, df) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_codecs_get_writer_reader(): # GH39247 expected = pd.DataFrame( diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 3a58dda9e8dc4..25504c7b88fdb 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -12,6 +12,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import is_platform_windows import pandas as pd @@ -137,6 +139,7 @@ def test_compression_warning(compression_only): df.to_csv(handles.handle, compression=compression_only) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_compression_binary(compression_only): """ Binary file handles support compression. diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 22a7d3b83a459..bd7f45396d38a 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -2,6 +2,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core.arrays import ( @@ -149,6 +151,7 @@ def test_path_localpath(self): result = tm.round_trip_localpath(df.to_feather, read_feather) tm.assert_frame_equal(df, result) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_passthrough_keywords(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -167,6 +170,7 @@ def test_http_path(self, feather_file, httpserver): res = read_feather(httpserver.url) tm.assert_frame_equal(expected, res) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_read_feather_dtype_backend(self, string_storage, dtype_backend): # GH#50765 df = pd.DataFrame( diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index a1dec8a2d05b4..7b78cf63f4167 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, date_range, @@ -195,6 +197,7 @@ def test_arrowparquet_options(fsspectest): @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_fastparquet_options(fsspectest): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -252,6 +255,7 @@ def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so): ) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.single_cpu 
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet def test_s3_parquet(s3_public_bucket, s3so, df1): diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 4b337b5b82052..a7ae9c7049702 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.pyarrow import pa_version_under17p0 from pandas import ( @@ -145,6 +147,7 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): assert result == expected +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("encoding", ["utf-8", "cp1251"]) def test_to_csv_compression_encoding_gcs( gcs_buffer, compression_only, encoding, compression_to_extension diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 607357e709b6e..298b9115b51e4 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -13,6 +13,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import is_platform_windows import pandas.util._test_decorators as td @@ -36,6 +38,10 @@ from pandas.io.common import file_path_to_url +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + @pytest.fixture( params=[ diff --git a/pandas/tests/io/test_http_headers.py b/pandas/tests/io/test_http_headers.py index 2ca11ad1f74e6..26e1412466e7b 100644 --- a/pandas/tests/io/test_http_headers.py +++ b/pandas/tests/io/test_http_headers.py @@ -7,6 +7,8 @@ import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -83,6 +85,7 @@ def stata_responder(df): return bio.getvalue() +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "responder, read_method", [ diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index a4021311fc963..52d6850483418 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import read_orc import pandas._testing as tm @@ -17,9 +19,12 @@ import pyarrow as pa -pytestmark = pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] @pytest.fixture diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 760a64c8d4c33..72efe989804e4 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -8,7 +8,10 @@ import numpy as np import pytest -from pandas._config import using_copy_on_write +from pandas._config import ( + using_copy_on_write, + using_string_dtype, +) from pandas._config.config import _get_option from pandas.compat import is_platform_windows @@ -52,6 +55,7 @@ pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 7068247bbfa8b..792c532fa8032 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -18,6 
+18,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib from pandas.compat import ( pa_version_under13p0, @@ -61,9 +63,12 @@ import sqlalchemy -pytestmark = pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] @pytest.fixture diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 6bd74faa8a3db..3c5e843e2e97b 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -11,6 +11,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -345,6 +347,7 @@ def test_write_dta6(self, datapath): check_index_type=False, ) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_read_write_dta10(self, version): original = DataFrame( @@ -1150,6 +1153,7 @@ def test_categorical_ordering(self, file, datapath): assert parsed[col].cat.ordered assert not parsed_unordered[col].cat.ordered + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore::UserWarning") @pytest.mark.parametrize( "file", @@ -1240,6 +1244,7 @@ def test_iterator(self, datapath): from_chunks = pd.concat(itr) tm.assert_frame_equal(parsed, from_chunks) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore::UserWarning") @pytest.mark.parametrize( "file", @@ -1543,6 +1548,7 @@ def test_inf(self, infval): with tm.ensure_clean() as path: df.to_stata(path) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_path_pathlib(self): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -1578,6 +1584,7 @@ def test_value_labels_iterator(self, write_index): value_labels = dta_iter.value_labels() assert value_labels == {"A": {0: "A", 1: "B", 2: "C", 3: "E"}} + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_set_index(self): # GH 17328 df = DataFrame( @@ -1611,6 +1618,7 @@ def test_date_parsing_ignores_format_details(self, column, datapath): formatted = df.loc[0, column + "_fmt"] assert unformatted == formatted + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_writer_117(self): original = DataFrame( data=[ @@ -1717,6 +1725,7 @@ def test_invalid_date_conversion(self): with pytest.raises(ValueError, match=msg): original.to_stata(path, convert_dates={"wrong_name": "tc"}) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_nonfile_writing(self, version): # GH 21041 @@ -1735,6 +1744,7 @@ def test_nonfile_writing(self, version): reread = read_stata(path, index_col="index") tm.assert_frame_equal(df, reread) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_gzip_writing(self): # writing version 117 requires seek and cannot be used with gzip df = DataFrame( @@ -1767,6 +1777,7 @@ def test_unicode_dta_118(self, datapath): tm.assert_frame_equal(unicode_df, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_mixed_string_strl(self): # GH 23633 output = 
[{"mixed": "string" * 500, "number": 0}, {"mixed": None, "number": 1}] @@ -1864,6 +1875,7 @@ def test_stata_119(self, datapath): reader._ensure_open() assert reader._nvar == 32999 + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [118, 119, None]) def test_utf8_writer(self, version): cat = pd.Categorical(["a", "β", "ĉ"], ordered=True) @@ -2131,6 +2143,7 @@ def test_iterator_errors(datapath, chunksize): pass +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_iterator_value_labels(): # GH 31544 values = ["c_label", "b_label"] + ["a_label"] * 500 diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 900734e9f0fdf..35beda37acf51 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -14,6 +14,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat._optional import import_optional_dependency from pandas.errors import ( EmptyDataError, @@ -2005,6 +2007,7 @@ def test_s3_parser_consistency(s3_public_bucket_with_data, s3so): tm.assert_frame_equal(df_lxml, df_etree) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_read_xml_nullable_dtypes( parser, string_storage, dtype_backend, using_infer_string ): diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index a85576ff13f5c..b2d96cb1d9133 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -4,6 +4,8 @@ import pytest +from pandas._config import using_string_dtype + from pandas.errors import ParserWarning import pandas.util._test_decorators as td @@ -83,6 +85,7 @@ def read_xml_iterparse(data, **kwargs): # DTYPE +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dtype_single_str(parser): df_result = read_xml(StringIO(xml_types), dtype={"degrees": "str"}, parser=parser) df_iter = read_xml_iterparse( @@ -208,6 +211,7 @@ def test_wrong_dtype(xml_books, parser, iterparse): ) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_both_dtype_converters(parser): df_expected = DataFrame( { diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 30ec0d0affaa3..fb457f20f7a48 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -1441,6 +1443,7 @@ def test_mode_numerical_nan(self, dropna, expected): expected = Series(expected) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dropna, expected1, expected2, expected3", [(True, ["b"], ["bar"], ["nan"]), (False, ["b"], [np.nan], ["nan"])], @@ -1468,6 +1471,7 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3): expected3 = Series(expected3) tm.assert_series_equal(result, expected3) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dropna, expected1, expected2", [(True, ["foo"], ["foo"]), (False, ["foo"], [np.nan])], @@ -1605,6 +1609,7 @@ def test_mode_intoverflow(self, dropna, expected1, expected2): expected2 = Series(expected2, dtype=np.uint64) tm.assert_series_equal(result, expected2) + @pytest.mark.xfail(using_string_dtype(), 
reason="TODO(infer_string)") def test_mode_sortwarning(self): # Check for the warning that is raised when the mode # results cannot be sorted diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 550523a432a89..32567b4300152 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import is_platform_windows import pandas as pd @@ -492,6 +494,7 @@ def test_empty(keys): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("consolidate", [True, False]) def test_resample_groupby_agg_object_dtype_all_nan(consolidate): # https://github.com/pandas-dev/pandas/issues/39329 diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 9e34d02091e69..2a52d3060e4b9 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import InvalidIndexError import pandas.util._test_decorators as td @@ -44,6 +46,7 @@ def test_append_concat(self): assert isinstance(result.index, PeriodIndex) assert result.index[0] == s1.index[0] + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_copy(self, using_array_manager, using_copy_on_write): df = DataFrame(np.random.default_rng(2).standard_normal((4, 3))) df2 = DataFrame(np.random.default_rng(2).integers(0, 10, size=4).reshape(4, 1)) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index a2e22ea73fd86..0865e3cfa8149 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -4,6 +4,8 @@ import pytest import pytz +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -3081,6 +3083,7 @@ def test_on_float_by_int(self): tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_merge_datatype_error_raises(self, using_infer_string): if using_infer_string: msg = "incompatible merge keys" diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index f9a03222c8057..b12438b6327ad 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Series, @@ -361,6 +363,7 @@ def test_with_prefix_contains_get_dummies_NaN_column(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "default_category, expected", [ diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 272c5b3403293..cbe2c9b931ee3 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -81,6 +83,7 @@ def test_default_col_names(self, df): result2 = df.melt(id_vars=["id1", "id2"]) assert 
result2.columns.tolist() == ["id1", "id2", "variable", "value"] + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_value_vars(self, df): result3 = df.melt(id_vars=["id1", "id2"], value_vars="A") assert len(result3) == 10 @@ -97,6 +100,7 @@ def test_value_vars(self, df): ) tm.assert_frame_equal(result4, expected4) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("type_", (tuple, list, np.array)) def test_value_vars_types(self, type_, df): # GH 15348 @@ -177,6 +181,7 @@ def test_tuple_vars_fail_with_multiindex(self, id_vars, value_vars, df1): with pytest.raises(ValueError, match=msg): df1.melt(id_vars=id_vars, value_vars=value_vars) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_name(self, df, var_name): result5 = df.melt(var_name=var_name) assert result5.columns.tolist() == ["var", "value"] @@ -204,6 +209,7 @@ def test_custom_var_name(self, df, var_name): ) tm.assert_frame_equal(result9, expected9) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_value_name(self, df, value_name): result10 = df.melt(value_name=value_name) assert result10.columns.tolist() == ["variable", "val"] @@ -233,6 +239,7 @@ def test_custom_value_name(self, df, value_name): ) tm.assert_frame_equal(result14, expected14) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_and_value_name(self, df, value_name, var_name): result15 = df.melt(var_name=var_name, value_name=value_name) assert result15.columns.tolist() == ["var", "val"] @@ -357,6 +364,7 @@ def test_melt_missing_columns_raises(self): with pytest.raises(KeyError, match=msg): multi.melt(["A"], ["F"], col_level=0) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_melt_mixed_int_str_id_vars(self): # GH 29718 df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]}) @@ -1197,6 +1205,7 @@ def test_raise_of_column_name_value(self): ): df.melt(id_vars="value", value_name="value") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", ["O", "string"]) def test_missing_stubname(self, dtype): # GH46044 @@ -1222,6 +1231,7 @@ def test_missing_stubname(self, dtype): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wide_to_long_pyarrow_string_columns(): # GH 57066 pytest.importorskip("pyarrow") diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 7b27d19483bd2..9aa13d59a586b 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1081,6 +1081,7 @@ def test_margins_dtype_len(self, data): tm.assert_frame_equal(expected, result) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) def test_pivot_table_multiindex_only(self, cols): # GH 17038 @@ -2524,6 +2525,7 @@ def test_pivot_empty(self): expected = DataFrame(index=[], columns=[]) tm.assert_frame_equal(result, expected, check_names=False) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [object, "string"]) def test_pivot_integer_bug(self, dtype): df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=dtype) diff --git a/pandas/tests/reshape/test_union_categoricals.py 
b/pandas/tests/reshape/test_union_categoricals.py index 8d78d34e936f0..1d5d16f39e648 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.concat import union_categoricals import pandas as pd @@ -122,6 +124,7 @@ def test_union_categoricals_nan(self): exp = Categorical([np.nan, np.nan, np.nan, np.nan]) tm.assert_categorical_equal(res, exp) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("val", [[], ["1"]]) def test_union_categoricals_empty(self, val, request, using_infer_string): # GH 13759 diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 34465a7c12c18..f0803ac2f2a30 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -11,6 +11,8 @@ import pytest import pytz +from pandas._config import using_string_dtype + from pandas._libs.tslibs.timezones import maybe_get_tz from pandas.errors import SettingWithCopyError @@ -526,6 +528,7 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): ser = pd.concat([ser, Series([pd.NaT])]) assert np.isnan(ser.dt.month_name(locale=time_locale).iloc[-1]) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime(self): # GH 10086 ser = Series(date_range("20130101", periods=5)) @@ -568,6 +571,7 @@ def test_strftime(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime_dt64_days(self): ser = Series(date_range("20130101", periods=5)) ser.iloc[0] = pd.NaT @@ -598,6 +602,7 @@ def test_strftime_period_days(self, using_infer_string): expected = expected.astype("string[pyarrow_numpy]") tm.assert_index_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime_dt64_microsecond_resolution(self): ser = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)]) result = ser.dt.strftime("%Y-%m-%d %H:%M:%S") @@ -630,6 +635,7 @@ def test_strftime_period_minutes(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data", [ diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index f4992b758af74..a26e541732d36 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import IndexingError from pandas import ( @@ -268,6 +270,7 @@ def test_slice(string_series, object_series, using_copy_on_write, warn_copy_on_w assert (string_series[10:20] == 0).all() +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_timedelta_assignment(): # GH 8209 s = Series([], dtype=object) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index ed681563f6fcd..fb8e5c31929b2 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.numpy import ( np_version_gt2, 
np_version_gte1p24, @@ -561,6 +563,7 @@ def test_append_timedelta_does_not_cast(self, td, using_infer_string, request): tm.assert_series_equal(ser, expected) assert isinstance(ser["td"], Timedelta) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_with_expansion_type_promotion(self): # GH#12599 ser = Series(dtype=object) @@ -570,6 +573,7 @@ def test_setitem_with_expansion_type_promotion(self): expected = Series([Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) tm.assert_series_equal(ser, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_not_contained(self, string_series): # set item that's not contained ser = string_series.copy() @@ -873,6 +877,7 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace): self._check_inplace(is_inplace, orig, arr, obj) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_index_where(self, obj, key, expected, warn, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -885,6 +890,7 @@ def test_index_where(self, obj, key, expected, warn, val, using_infer_string): expected_idx = Index(expected, dtype=expected.dtype) tm.assert_index_equal(res, expected_idx) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True diff --git a/pandas/tests/series/methods/test_info.py b/pandas/tests/series/methods/test_info.py index 29dd704f6efa9..8fac40fe5fb25 100644 --- a/pandas/tests/series/methods/test_info.py +++ b/pandas/tests/series/methods/test_info.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import PYPY from pandas import ( @@ -140,6 +142,7 @@ def test_info_memory_usage_deep_pypy(): assert s_object.memory_usage(deep=True) == s_object.memory_usage() +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, plus", [ diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 850740fac907d..c6727e023e786 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -760,6 +760,7 @@ def test_replace_value_none_dtype_numeric(self, val): expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_change_dtype_series(self, using_infer_string): # GH#25797 df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]}) diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index 1c17013d621c7..999dd90d337d9 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import Series import pandas._testing as tm @@ -24,6 +26,7 @@ def read_csv(self, path, **kwargs): return out + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_from_csv(self, datetime_series, string_series): # freq doesn't round-trip datetime_series.index = datetime_series.index._with_freq(None) diff --git a/pandas/tests/series/methods/test_unstack.py 
b/pandas/tests/series/methods/test_unstack.py index 3c70e839c8e20..8569e0f49716a 100644 --- a/pandas/tests/series/methods/test_unstack.py +++ b/pandas/tests/series/methods/test_unstack.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -134,6 +136,7 @@ def test_unstack_mixed_type_name_in_multiindex( tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_unstack_multi_index_categorical_values(): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index b40e2e99dae2e..1ffc9ddca5adf 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib from pandas._libs.tslibs import IncompatibleFrequency @@ -499,6 +501,7 @@ def test_ser_cmp_result_names(self, names, comparison_op): result = op(ser, cidx) assert result.name == names[2] + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_comparisons(self, using_infer_string): s = Series(["a", "b", "c"]) s2 = Series([False, True, False]) diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index d9c94e871bd4b..197ef47759bf3 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Index, @@ -360,6 +362,7 @@ def test_reverse_ops_with_index(self, op, expected): result = op(ser, idx) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_logical_ops_label_based(self, using_infer_string): # GH#4947 # logical ops should be label based diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 718d1b3ee2e83..f3a7ba2607f4a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import ( algos as libalgos, hashtable as ht, @@ -1701,6 +1703,7 @@ def test_unique_complex_numbers(self, array, expected): class TestHashTable: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "htable, data", [ @@ -1740,6 +1743,7 @@ def test_hashtable_unique(self, htable, data, writable): reconstr = result_unique[result_inverse] tm.assert_numpy_array_equal(reconstr, s_duplicated.values) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "htable, data", [ From d36a5aabff841f77ff51eef836d136307f333586 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 30 Jul 2024 19:07:20 +0200 Subject: [PATCH 202/396] TST (string dtype): follow-up on GH-59329 fixing new xfails (#59352) * TST (string dtype): follow-up on GH-59329 fixing new xfails * add missing strict --- pandas/_testing/asserters.py | 10 ++++++++-- .../tests/arrays/interval/test_interval_pyarrow.py | 3 +++ pandas/tests/arrays/masked/test_arrow_compat.py | 12 +++++++++--- pandas/tests/arrays/masked/test_function.py | 3 --- pandas/tests/arrays/period/test_arrow_compat.py | 4 ++++ pandas/tests/arrays/string_/test_string.py | 
4 ++++ pandas/tests/arrays/test_array.py | 3 +++ pandas/tests/dtypes/test_common.py | 3 +++ pandas/tests/frame/methods/test_astype.py | 3 +++ pandas/tests/frame/test_arithmetic.py | 1 + pandas/tests/groupby/test_apply.py | 3 +++ pandas/tests/indexes/base_class/test_formats.py | 1 + pandas/tests/indexes/multi/test_setops.py | 3 +++ pandas/tests/indexes/test_old_base.py | 1 + pandas/tests/io/excel/test_readers.py | 4 +++- pandas/tests/io/json/test_json_table_schema.py | 6 ++++++ pandas/tests/io/json/test_pandas.py | 2 ++ pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 2 +- pandas/tests/io/parser/test_upcast.py | 2 +- pandas/tests/io/parser/usecols/test_usecols_basic.py | 3 +++ pandas/tests/io/test_feather.py | 10 ++++++---- pandas/tests/io/test_fsspec.py | 1 + pandas/tests/reshape/test_get_dummies.py | 3 +++ pandas/tests/series/methods/test_convert_dtypes.py | 3 +++ pandas/tests/test_downstream.py | 3 +++ 25 files changed, 78 insertions(+), 15 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 41d2a7344a4ed..e79e353137152 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -593,13 +593,19 @@ def raise_assert_detail( if isinstance(left, np.ndarray): left = pprint_thing(left) - elif isinstance(left, (CategoricalDtype, NumpyEADtype, StringDtype)): + elif isinstance(left, (CategoricalDtype, NumpyEADtype)): left = repr(left) + elif isinstance(left, StringDtype): + # TODO(infer_string) this special case could be avoided if we have + # a more informative repr https://github.com/pandas-dev/pandas/issues/59342 + left = f"StringDtype(storage={left.storage}, na_value={left.na_value})" if isinstance(right, np.ndarray): right = pprint_thing(right) - elif isinstance(right, (CategoricalDtype, NumpyEADtype, StringDtype)): + elif isinstance(right, (CategoricalDtype, NumpyEADtype)): right = repr(right) + elif isinstance(right, StringDtype): + right = f"StringDtype(storage={right.storage}, na_value={right.na_value})" msg += f""" [left]: {left} diff --git a/pandas/tests/arrays/interval/test_interval_pyarrow.py b/pandas/tests/arrays/interval/test_interval_pyarrow.py index ef8701be81e2b..be87d5d3ef7ba 100644 --- a/pandas/tests/arrays/interval/test_interval_pyarrow.py +++ b/pandas/tests/arrays/interval/test_interval_pyarrow.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -80,6 +82,7 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 7a89656bd5aa0..31765165f5f16 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -1,12 +1,18 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm -pytestmark = pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] + pa = pytest.importorskip("pyarrow") diff 
--git a/pandas/tests/arrays/masked/test_function.py b/pandas/tests/arrays/masked/test_function.py index 81338bca460a6..b259018cd6121 100644 --- a/pandas/tests/arrays/masked/test_function.py +++ b/pandas/tests/arrays/masked/test_function.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import is_integer_dtype import pandas as pd @@ -60,7 +58,6 @@ def test_tolist(data): tm.assert_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_numpy(): # GH#56991 diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index 431309aca0df2..ff86b696c8403 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -1,5 +1,7 @@ import pytest +from pandas._config import using_string_dtype + from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.dtypes import PeriodDtype @@ -77,6 +79,7 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_arrow_table_roundtrip(): from pandas.core.arrays.arrow.extension_types import ArrowPeriodType @@ -96,6 +99,7 @@ def test_arrow_table_roundtrip(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_arrow_load_from_zero_chunks(): # GH-41040 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 3ef7862d739cd..2663f3d7c0595 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.pyarrow import pa_version_under12p0 from pandas.core.dtypes.common import is_dtype_equal @@ -513,6 +515,7 @@ def test_arrow_array(dtype): assert arr.equals(expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): # roundtrip possible from arrow 1.0.0 @@ -541,6 +544,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): assert result.loc[2, "a"] is result["a"].dtype.na_value +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_load_from_zero_chunks( dtype, string_storage2, request, using_infer_string diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 96263f498935b..76b42b643ee69 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -6,6 +6,8 @@ import pytest import pytz +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.api.extensions import register_extension_dtype @@ -275,6 +277,7 @@ def test_array_copy(): cet = pytz.timezone("CET") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data, expected", [ diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index c34c97b6e4f04..982156d7a9b1d 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -3,6 
+3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas.core.dtypes.astype import astype_array @@ -128,6 +130,7 @@ def test_dtype_equal(name1, dtype1, name2, dtype2): assert not com.is_dtype_equal(dtype1, dtype2) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("name,dtype", list(dtypes.items()), ids=lambda x: str(x)) def test_pyarrow_string_import_error(name, dtype): # GH-44276 diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 5a1e3cd786f84..b6510f384fabe 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -757,6 +759,7 @@ def test_astype_tz_object_conversion(self, tz): result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_astype_dt64_to_string( self, frame_or_series, tz_naive_fixture, using_infer_string ): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 4e32dc0cb3f98..0407388d61f51 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -2124,6 +2124,7 @@ def test_enum_column_equality(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_mixed_col_index_dtype(): # GH 47382 df1 = DataFrame(columns=list("abc"), data=1.0, index=[0]) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 0ddacfab8c102..4972a6b3afa17 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -940,6 +942,7 @@ def test_func_returns_object(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index 955e3be107f75..b2f345e5e6f77 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -9,6 +9,7 @@ class TestIndexRendering: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_repr_is_valid_construction_code(self): # for the case of Index, where the repr is traditional rather than # stylized diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 0abb56ecf9de7..31a5d2fb906eb 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( CategoricalIndex, @@ -758,6 +760,7 @@ def test_intersection_keep_ea_dtypes(val, any_numeric_ea_dtype): tm.assert_index_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_union_with_na_when_constructing_dataframe(): # GH43222 
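# For context on the marker added above (and the many like it across this
# series): using_string_dtype() reads the "future.infer_string" option, so
# these xfails only fire when the suite is run with the future string dtype
# enabled. A sketch of the switch being probed:
#
#     import pandas as pd
#     from pandas._config import using_string_dtype
#
#     with pd.option_context("future.infer_string", True):
#         assert using_string_dtype()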
series1 = Series( diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 0e6722ff50d1e..a2256322d968b 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -232,6 +232,7 @@ def test_logical_compat(self, simple_index): with pytest.raises(TypeError, match=msg): idx.any() + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_repr_roundtrip(self, simple_index): if isinstance(simple_index, IntervalIndex): pytest.skip(f"Not a valid repr for {type(simple_index).__name__}") diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index a83176cfe28f8..1b79b4bff1cea 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -660,7 +660,9 @@ def test_dtype_backend_and_dtype(self, read_ext): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail(using_string_dtype(), reason="infer_string takes precedence") + @pytest.mark.xfail( + using_string_dtype(), reason="infer_string takes precedence", strict=False + ) def test_dtype_backend_string(self, read_ext, string_storage): # GH#36712 if read_ext in (".xlsb", ".xls"): diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index cc101bb9c8b6d..53e819ac5eaff 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, @@ -24,6 +26,10 @@ set_default_names, ) +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + @pytest.fixture def df_schema(): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 99c45c61fc8a4..a1d2e93e7c523 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1479,6 +1479,7 @@ def test_data_frame_size_after_to_json(self): assert size_before == size_after + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]] ) @@ -1491,6 +1492,7 @@ def test_from_json_to_json_table_index_and_columns(self, index, columns): result = read_json(StringIO(dfjson), orient="table") tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_from_json_to_json_table_dtypes(self): # GH21345 expected = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 9c1ae6f9b4236..da17999bba4ca 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -463,7 +463,7 @@ def test_dtype_backend_and_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend_string(all_parsers, string_storage): # GH#36712 pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index d8c40670afcbd..01e576ba40f26 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ 
b/pandas/tests/io/parser/test_upcast.py @@ -86,7 +86,7 @@ def test_maybe_upcaste_all_nan(): tm.assert_extension_array_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("val", [na_values[np.object_], "c"]) def test_maybe_upcast_object(val, string_storage): # GH#36712 diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 767fba666e417..24937de163662 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ParserError from pandas import ( @@ -545,6 +547,7 @@ def test_usecols_additional_columns_integer_columns(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_usecols_dtype(all_parsers): parser = all_parsers data = """ diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index bd7f45396d38a..d169fab3f1832 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -13,9 +13,12 @@ from pandas.io.feather_format import read_feather, to_feather # isort:skip -pytestmark = pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] pa = pytest.importorskip("pyarrow") @@ -170,7 +173,6 @@ def test_http_path(self, feather_file, httpserver): res = read_feather(httpserver.url) tm.assert_frame_equal(expected, res) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_read_feather_dtype_backend(self, string_storage, dtype_backend): # GH#50765 df = pd.DataFrame( diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 7b78cf63f4167..65f4156cedf49 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -277,6 +277,7 @@ def test_not_present_exception(): read_csv("memory://test/test.csv") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_feather_options(fsspectest): pytest.importorskip("pyarrow") df = DataFrame({"a": [0]}) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 31260e4dcb7d2..3d9b3a6d1c7a2 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype @@ -214,6 +216,7 @@ def test_dataframe_dummies_all_obj(self, df, sparse): tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dataframe_dummies_string_dtype(self, df, using_infer_string): # GH44965 df = df[["A", "B"]] diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index b0a920ba02cad..46fed9032c13d 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -3,6 +3,8 
@@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib import pandas as pd @@ -181,6 +183,7 @@ def test_cases(request): class TestSeriesConvertDtypes: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("params", product(*[(True, False)] * 5)) def test_convert_dtypes( self, diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 51ce73ef54300..bf88da04b73ff 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -165,6 +167,7 @@ def test_pandas_datareader(): pytest.importorskip("pandas_datareader") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_pyarrow(df): pyarrow = pytest.importorskip("pyarrow") From 2211674f5527964b6cee6815fcdf455f14040dc4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 31 Jul 2024 19:24:33 +0200 Subject: [PATCH 203/396] TST (string dtype): change any_string_dtype fixture to use actual dtype instances (#59345) * TST (string dtype): change any_string_dtype fixture to use actual dtype instances * avoid pyarrow import error during test collection * fix dtype equality in case pyarrow is not installed * keep using mode.string_storage as default for NA variant + more xfails * fix test_series_string_inference_storage_definition * remove no longer necessary xfails --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/conftest.py | 29 +++++--- pandas/core/arrays/string_.py | 6 +- .../arrays/categorical/test_constructors.py | 1 - pandas/tests/copy_view/test_astype.py | 2 - pandas/tests/dtypes/test_common.py | 3 - pandas/tests/io/parser/test_index_col.py | 3 + pandas/tests/series/test_constructors.py | 2 +- pandas/tests/strings/__init__.py | 10 ++- pandas/tests/strings/test_find_replace.py | 70 ++++++++++++++----- pandas/tests/strings/test_split_partition.py | 4 +- pandas/tests/strings/test_strings.py | 32 ++++++--- 11 files changed, 115 insertions(+), 47 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 10134c90f8eeb..a502152780a27 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1306,20 +1306,33 @@ def object_dtype(request): @pytest.fixture( params=[ - "object", - "string[python]", - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ] + np.dtype("object"), + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ], + ids=[ + "string=object", + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + ], ) def any_string_dtype(request): """ Parametrized fixture for string dtypes. 
* 'object' - * 'string[python]' - * 'string[pyarrow]' + * 'string[python]' (NA variant) + * 'string[pyarrow]' (NA variant) + * 'str' (NaN variant, with pyarrow) """ - return request.param + if isinstance(request.param, np.dtype): + return request.param + else: + # need to instantiate the StringDtype here instead of in the params + # to avoid importing pyarrow during test collection + storage, na_value = request.param + return pd.StringDtype(storage, na_value) @pytest.fixture(params=tm.DATETIME64_DTYPES) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c40f5b8f58d9e..c4f208027c9da 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -124,7 +124,7 @@ def __init__( ) -> None: # infer defaults if storage is None: - if using_string_dtype(): + if using_string_dtype() and na_value is not libmissing.NA: storage = "pyarrow" else: storage = get_option("mode.string_storage") @@ -162,7 +162,9 @@ def __eq__(self, other: object) -> bool: return True try: other = self.construct_from_string(other) - except TypeError: + except (TypeError, ImportError): + # TypeError if `other` is not a valid string for StringDtype + # ImportError if pyarrow is not installed for "string[pyarrow]" return False if isinstance(other, type(self)): return self.storage == other.storage and self.na_value is other.na_value diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 60d78a906b528..6813683cb5219 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -743,7 +743,6 @@ def test_interval(self): tm.assert_numpy_array_equal(cat.codes, expected_codes) tm.assert_index_equal(cat.categories, idx) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_categorical_extension_array_nullable(self, nulls_fixture): # GH: arr = pd.arrays.StringArray._from_sequence( diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 14fd8fb5f911e..514ee6410ecf1 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -98,7 +98,6 @@ def test_astype_numpy_to_ea(): assert np.shares_memory(get_array(ser), get_array(result)) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) @@ -116,7 +115,6 @@ def test_astype_string_and_object(using_copy_on_write, dtype, new_dtype): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 982156d7a9b1d..c34c97b6e4f04 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from pandas.core.dtypes.astype import astype_array @@ -130,7 +128,6 @@ def test_dtype_equal(name1, dtype1, name2, dtype2): assert not com.is_dtype_equal(dtype1, dtype2) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("name,dtype", list(dtypes.items()), ids=lambda x: str(x)) def test_pyarrow_string_import_error(name, dtype): # GH-44276 diff --git 
a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index ba15d061b2deb..6dbfed2b6ae83 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Index, @@ -342,6 +344,7 @@ def test_infer_types_boolean_sum(all_parsers): tm.assert_frame_equal(result, expected, check_index_type=False) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)]) def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): # GH#9435 diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 391c9361080d8..0c39cead78baf 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2138,7 +2138,7 @@ def test_series_string_inference_storage_definition(self): # returning the NA string dtype, so expected is changed from # "string[pyarrow_numpy]" to "string[pyarrow]" pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow]") + expected = Series(["a", "b"], dtype="string[python]") with pd.option_context("future.infer_string", True): result = Series(["a", "b"], dtype="string") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index e94f656fc9823..6c4bec6a23789 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -2,7 +2,15 @@ import pandas as pd -object_pyarrow_numpy = ("object", "string[pyarrow_numpy]") + +def is_object_or_nan_string_dtype(dtype): + """ + Check if string-like dtype is following NaN semantics, i.e. is object + dtype or a NaN-variant of the StringDtype. 
+ """ + return (isinstance(dtype, np.dtype) and dtype == "object") or ( + dtype.na_value is np.nan + ) def _convert_na_value(ser, expected): diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index cd4707ac405de..df490297e2a5c 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -14,7 +14,7 @@ ) from pandas.tests.strings import ( _convert_na_value, - object_pyarrow_numpy, + is_object_or_nan_string_dtype, ) # -------------------------------------------------------------------------------------- @@ -34,7 +34,9 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series( np.array([False, np.nan, True, True, False], dtype=np.object_), dtype=expected_dtype, @@ -53,7 +55,9 @@ def test_contains(any_string_dtype): dtype=any_string_dtype, ) result = values.str.contains(pat) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -80,14 +84,18 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series( np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype ) tm.assert_series_equal(result, expected) result = values.str.contains(pat, na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -172,7 +180,9 @@ def test_contains_moar(any_string_dtype): ) result = s.str.contains("a") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series( [False, False, False, True, True, False, np.nan, False, False, True], dtype=expected_dtype, @@ -213,7 +223,9 @@ def test_contains_nan(any_string_dtype): s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) result = s.str.contains("foo", na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -231,7 +243,9 @@ def test_contains_nan(any_string_dtype): tm.assert_series_equal(result, expected) result = s.str.contains("foo") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -641,7 +655,9 @@ def test_replace_regex_single_character(regex, 
any_string_dtype): def test_match(any_string_dtype): # New match behavior introduced in 0.13 - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) result = values.str.match(".*(BAD[_]+).*(BAD)") @@ -696,12 +712,16 @@ def test_match_na_kwarg(any_string_dtype): s = Series(["a", "b", np.nan], dtype=any_string_dtype) result = s.str.match("a", na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = s.str.match("a") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -709,7 +729,9 @@ def test_match_na_kwarg(any_string_dtype): def test_match_case_kwarg(any_string_dtype): values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) result = values.str.match("ab", case=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -725,7 +747,9 @@ def test_fullmatch(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -734,7 +758,9 @@ def test_fullmatch_dollar_literal(any_string_dtype): # GH 56652 ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype) result = ser.str.fullmatch("foo\\$") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([False, False, np.nan, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -744,14 +770,18 @@ def test_fullmatch_na_kwarg(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_fullmatch_case_kwarg(any_string_dtype): ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False, False], dtype=expected_dtype) @@ -823,7 
+853,9 @@ def test_find(any_string_dtype): ser = Series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = ser.str.find("EF") expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype) @@ -875,7 +907,9 @@ def test_find_nan(any_string_dtype): ser = Series( ["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = ser.str.find("EF") expected = Series([4, np.nan, 1, np.nan, -1], dtype=expected_dtype) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 9ff1fc0e13ae9..423993e881b98 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -14,7 +14,7 @@ ) from pandas.tests.strings import ( _convert_na_value, - object_pyarrow_numpy, + is_object_or_nan_string_dtype, ) @@ -384,7 +384,7 @@ def test_split_nan_expand(any_string_dtype): # check that these are actually np.nan/pd.NA and not None # TODO see GH 18463 # tm.assert_frame_equal does not differentiate - if any_string_dtype in object_pyarrow_numpy: + if is_object_or_nan_string_dtype(any_string_dtype): assert all(np.isnan(x) for x in result.iloc[1]) else: assert all(x is pd.NA for x in result.iloc[1]) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index f662dfd7e2b14..015df18221b40 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -14,7 +14,7 @@ ) import pandas._testing as tm from pandas.core.strings.accessor import StringMethods -from pandas.tests.strings import object_pyarrow_numpy +from pandas.tests.strings import is_object_or_nan_string_dtype @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])]) @@ -41,7 +41,9 @@ def test_iter_raises(): def test_count(any_string_dtype): ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype) result = ser.str.count("f[o]+") - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = Series([1, 2, np.nan, 4], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -93,7 +95,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat): def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) - if any_string_dtype in object_pyarrow_numpy: + if is_object_or_nan_string_dtype(any_string_dtype): empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) else: @@ -207,7 +209,9 @@ def test_ismethods(method, expected, any_string_dtype): ser = Series( ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "], dtype=any_string_dtype ) - expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -232,7 +236,9 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): ser = Series( ["A", "3", "¼", "★", 
"፸", "3", "four"], dtype=any_string_dtype # noqa: RUF001 ) - expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -252,7 +258,9 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): def test_isnumeric_unicode_missing(method, expected, any_string_dtype): values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] # noqa: RUF001 ser = Series(values, dtype=any_string_dtype) - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -283,7 +291,9 @@ def test_len(any_string_dtype): dtype=any_string_dtype, ) result = ser.str.len() - expected_dtype = "float64" if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + "float64" if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -312,7 +322,9 @@ def test_index(method, sub, start, end, index_or_series, any_string_dtype, expec obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = index_or_series(expected, dtype=expected_dtype) result = getattr(obj.str, method)(sub, start, end) @@ -353,7 +365,9 @@ def test_index_wrong_type_raises(index_or_series, any_string_dtype, method): ) def test_index_missing(any_string_dtype, method, exp): ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype) - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = getattr(ser.str, method)("b") expected = Series(exp + [np.nan], dtype=expected_dtype) From fd0f7bd32526a60634a4c44fee7666fc1b9f6a19 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 31 Jul 2024 20:25:49 +0200 Subject: [PATCH 204/396] TST (string dtype): remove usage of arrow_string_storage fixture (#59368) * TST (string dtype): remove usage of arrow_string_storage fixture * fixup --- pandas/tests/arrays/string_/test_string.py | 16 ++++++++-------- pandas/tests/arrays/string_/test_string_arrow.py | 12 ++++++------ pandas/tests/extension/test_string.py | 12 ++++++------ 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 2663f3d7c0595..1c55c1d8f3e2e 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -165,8 +165,8 @@ def test_add(dtype): tm.assert_series_equal(result, expected) -def test_add_2d(dtype, request, arrow_string_storage): - if dtype.storage in arrow_string_storage: +def test_add_2d(dtype, request): + if dtype.storage == "pyarrow": reason = "Failed: DID NOT RAISE " mark = pytest.mark.xfail(raises=None, reason=reason) request.applymarker(mark) @@ -464,8 +464,8 @@ def test_min_max(method, 
skipna, dtype): @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("box", [pd.Series, pd.array]) -def test_min_max_numpy(method, box, dtype, request, arrow_string_storage): - if dtype.storage in arrow_string_storage and box is pd.array: +def test_min_max_numpy(method, box, dtype, request): + if dtype.storage == "pyarrow" and box is pd.array: if box is pd.array: reason = "'<=' not supported between instances of 'str' and 'NoneType'" else: @@ -479,7 +479,7 @@ def test_min_max_numpy(method, box, dtype, request, arrow_string_storage): assert result == expected -def test_fillna_args(dtype, arrow_string_storage): +def test_fillna_args(dtype): # GH 37987 arr = pd.array(["a", pd.NA], dtype=dtype) @@ -492,7 +492,7 @@ def test_fillna_args(dtype, arrow_string_storage): expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) - if dtype.storage in arrow_string_storage: + if dtype.storage == "pyarrow": msg = "Invalid value '1' for dtype string" else: msg = "Cannot set non-string value '1' into a StringArray." @@ -643,10 +643,10 @@ def test_value_counts_sort_false(dtype): tm.assert_series_equal(result, expected) -def test_memory_usage(dtype, arrow_string_storage): +def test_memory_usage(dtype): # GH 33963 - if dtype.storage in arrow_string_storage: + if dtype.storage == "pyarrow": pytest.skip(f"not applicable for {dtype.storage}") series = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 06013c3d11664..dceb93364d505 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -48,18 +48,18 @@ def test_config_bad_storage_raises(): @pytest.mark.parametrize("chunked", [True, False]) -@pytest.mark.parametrize("array", ["numpy", "pyarrow"]) -def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage): +@pytest.mark.parametrize("array_lib", ["numpy", "pyarrow"]) +def test_constructor_not_string_type_raises(array_lib, chunked): pa = pytest.importorskip("pyarrow") - array = pa if array in arrow_string_storage else np + array_lib = pa if array_lib == "pyarrow" else np - arr = array.array([1, 2, 3]) + arr = array_lib.array([1, 2, 3]) if chunked: - if array is np: + if array_lib is np: pytest.skip("chunked not applicable to numpy array") arr = pa.chunked_array(arr) - if array is np: + if array_lib is np: msg = "Unsupported type '' for ArrowExtensionArray" else: msg = re.escape( diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 5a72b2244d2bf..895640d9fbeaa 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -115,8 +115,8 @@ def test_is_not_string_type(self, dtype): # because StringDtype is a string type assert is_string_dtype(dtype) - def test_view(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_view(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_view(data) @@ -124,13 +124,13 @@ def test_from_dtype(self, data): # base test uses string representation of dtype pass - def test_transpose(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_transpose(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") 
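# For context on the simplification above: the removed arrow_string_storage
# fixture (defined in pandas/conftest.py, not shown in this series) supplied
# the pyarrow-backed storage names; a sketch of what it returned:
#
#     @pytest.fixture
#     def arrow_string_storage():
#         return ("pyarrow", "pyarrow_numpy")
#
# With the "pyarrow_numpy" storage string retired later in the series
# (PATCH 206), the membership test `storage in arrow_string_storage`
# collapses to the plain `storage == "pyarrow"` comparison used here.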
super().test_transpose(data) - def test_setitem_preserves_views(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_setitem_preserves_views(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_setitem_preserves_views(data) From 014c487d846d4af9afba8d102009222218b73225 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 31 Jul 2024 22:44:18 +0200 Subject: [PATCH 205/396] TST (string dtype): replace string_storage fixture with explicit storage/na_value keyword arguments for dtype creation (#59375) --- pandas/conftest.py | 18 ++++++++++++++++++ pandas/tests/arrays/string_/test_string.py | 7 ++++--- pandas/tests/extension/test_string.py | 5 +++-- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index a502152780a27..39d4e25e4cbfd 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1262,6 +1262,24 @@ def string_storage(request): return request.param +@pytest.fixture( + params=[ + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ] +) +def string_dtype_arguments(request): + """ + Parametrized fixture for StringDtype storage and na_value. + + * 'python' + pd.NA + * 'pyarrow' + pd.NA + * 'pyarrow' + np.nan + """ + return request.param + + @pytest.fixture( params=[ "numpy_nullable", diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 1c55c1d8f3e2e..b1c5f4338a4ed 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -22,9 +22,10 @@ @pytest.fixture -def dtype(string_storage): - """Fixture giving StringDtype from parametrized 'string_storage'""" - return pd.StringDtype(storage=string_storage) +def dtype(string_dtype_arguments): + """Fixture giving StringDtype from parametrized storage and na_value arguments""" + storage, na_value = string_dtype_arguments + return pd.StringDtype(storage=storage, na_value=na_value) @pytest.fixture diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 895640d9fbeaa..1102d9d941663 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -58,8 +58,9 @@ def chunked(request): @pytest.fixture -def dtype(string_storage): - return StringDtype(storage=string_storage) +def dtype(string_dtype_arguments): + storage, na_value = string_dtype_arguments + return StringDtype(storage=storage, na_value=na_value) @pytest.fixture From fef0ea450ac9fa2ee299bb5c34468a2927be4ecd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 1 Aug 2024 17:55:37 +0200 Subject: [PATCH 206/396] String dtype: restrict options.mode.string_storage to python|pyarrow (remove pyarrow_numpy) (#59376) * String dtype: restrict options.mode.string_storage to python|pyarrow (remove pyarrow_numpy) * add type annotation --- pandas/conftest.py | 2 -- pandas/core/config_init.py | 22 +++++++++++-- pandas/tests/arrays/string_/test_string.py | 32 +++++-------------- .../tests/arrays/string_/test_string_arrow.py | 10 +++--- pandas/tests/frame/methods/test_astype.py | 9 ++++++ .../frame/methods/test_convert_dtypes.py | 5 ++- pandas/tests/io/conftest.py | 16 ---------- 7 files changed, 45 insertions(+), 51 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 39d4e25e4cbfd..78cdc2ac5a2bb 100644 --- 
a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1248,7 +1248,6 @@ def nullable_string_dtype(request): params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) def string_storage(request): @@ -1257,7 +1256,6 @@ def string_storage(request): * 'python' * 'pyarrow' - * 'pyarrow_numpy' """ return request.param diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index a6625d99eaa71..4cd7e50f0ec50 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -12,7 +12,10 @@ from __future__ import annotations import os -from typing import Callable +from typing import ( + Any, + Callable, +) import pandas._config.config as cf from pandas._config.config import ( @@ -506,12 +509,27 @@ def use_inf_as_na_cb(key) -> None: ``future.infer_string`` is set to True. """ + +def is_valid_string_storage(value: Any) -> None: + legal_values = ["python", "pyarrow"] + if value not in legal_values: + msg = "Value must be one of python|pyarrow" + if value == "pyarrow_numpy": + # TODO: we can remove extra message after 3.0 + msg += ( + ". 'pyarrow_numpy' was specified, but this option should be " + "enabled using pandas.options.future.infer_string instead" + ) + raise ValueError(msg) + + with cf.config_prefix("mode"): cf.register_option( "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), + # validator=is_one_of_factory(["python", "pyarrow"]), + validator=is_valid_string_storage, ) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b1c5f4338a4ed..6bf20b6fcd5f7 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -516,19 +516,12 @@ def test_arrow_array(dtype): assert arr.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): +def test_arrow_roundtrip(dtype, string_storage, using_infer_string): # roundtrip possible from arrow 1.0.0 pa = pytest.importorskip("pyarrow") - if using_infer_string and string_storage2 != "pyarrow_numpy": - request.applymarker( - pytest.mark.xfail( - reason="infer_string takes precedence over string storage" - ) - ) - data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -536,30 +529,21 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" - with pd.option_context("string_storage", string_storage2): + with pd.option_context("string_storage", string_storage): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage2}]") + expected = df.astype(f"string[{string_storage}]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is result["a"].dtype.na_value -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def 
test_arrow_load_from_zero_chunks( - dtype, string_storage2, request, using_infer_string -): +def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 pa = pytest.importorskip("pyarrow") - if using_infer_string and string_storage2 != "pyarrow_numpy": - request.applymarker( - pytest.mark.xfail( - reason="infer_string takes precedence over string storage" - ) - ) - data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -569,10 +553,10 @@ def test_arrow_load_from_zero_chunks( assert table.field("a").type == "large_string" # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) - with pd.option_context("string_storage", string_storage2): + with pd.option_context("string_storage", string_storage): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage2}]") + expected = df.astype(f"string[{string_storage}]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index dceb93364d505..d38b728aaf120 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -27,16 +27,18 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage != "pyarrow_numpy": - request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) - if string_storage == "pyarrow_numpy": + if using_infer_string and string_storage == "python": + # python string storage with na_value=NaN is not yet implemented request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) assert result.dtype.storage == string_storage - dtype = StringDtype(string_storage) + dtype = StringDtype( + string_storage, na_value=np.nan if using_infer_string else pd.NA + ) expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype) tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index b6510f384fabe..ea9cc22d93758 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -912,3 +912,12 @@ def test_astype_to_string_not_modifying_input(string_storage, val): with option_context("mode.string_storage", string_storage): df.astype("string", copy=False) tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT]) +def test_astype_to_string_dtype_not_modifying_input(any_string_dtype, val): + # GH#51073 - variant of the above test with explicit dtype instances + df = DataFrame({"a": ["a", "b", val]}) + expected = df.copy() + df.astype(any_string_dtype) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 91fa81b5bee2e..59779234b46d9 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -10,6 +10,8 @@ class TestConvertDtypes: + # TODO convert_dtypes should not use NaN variant of string dtype, but always NA + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "convert_integer, 
expected", [(False, np.dtype("int32")), (True, "Int32")] ) @@ -18,9 +20,6 @@ def test_convert_dtypes( ): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here - if using_infer_string: - string_storage = "pyarrow_numpy" - df = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index ab6cacc4cc860..bdefadf3dbec0 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -224,19 +224,3 @@ def compression_format(request): @pytest.fixture(params=_compression_formats_params) def compression_ext(request): return request.param[0] - - -@pytest.fixture( - params=[ - "python", - pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - ] -) -def string_storage(request): - """ - Parametrized fixture for pd.options.mode.string_storage. - - * 'python' - * 'pyarrow' - """ - return request.param From 6fad5c98d78c6c733b21e282badbb6b5b135ba7c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 6 Aug 2024 19:06:22 +0200 Subject: [PATCH 207/396] API/TST: expand tests for string any/all reduction + fix pyarrow-based implementation (#59414) --- pandas/tests/reductions/test_reductions.py | 51 +++++++++++++++++++--- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index fb457f20f7a48..cb9fd9e8da0df 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1091,25 +1091,62 @@ def test_any_all_datetimelike(self): assert df.any().all() assert not df.all().any() - def test_any_all_pyarrow_string(self): + def test_any_all_string_dtype(self, any_string_dtype): # GH#54591 - pytest.importorskip("pyarrow") - ser = Series(["", "a"], dtype="string[pyarrow_numpy]") + if ( + isinstance(any_string_dtype, pd.StringDtype) + and any_string_dtype.na_value is pd.NA + ): + # the nullable string dtype currently still raise an error + # https://github.com/pandas-dev/pandas/issues/51939 + ser = Series(["a", "b"], dtype=any_string_dtype) + with pytest.raises(TypeError): + ser.any() + with pytest.raises(TypeError): + ser.all() + return + + ser = Series(["", "a"], dtype=any_string_dtype) assert ser.any() assert not ser.all() + assert ser.any(skipna=False) + assert not ser.all(skipna=False) - ser = Series([None, "a"], dtype="string[pyarrow_numpy]") + ser = Series([np.nan, "a"], dtype=any_string_dtype) assert ser.any() assert ser.all() - assert not ser.all(skipna=False) + assert ser.any(skipna=False) + assert ser.all(skipna=False) # NaN is considered truthy - ser = Series([None, ""], dtype="string[pyarrow_numpy]") + ser = Series([np.nan, ""], dtype=any_string_dtype) assert not ser.any() assert not ser.all() + assert ser.any(skipna=False) # NaN is considered truthy + assert not ser.all(skipna=False) - ser = Series(["a", "b"], dtype="string[pyarrow_numpy]") + ser = Series(["a", "b"], dtype=any_string_dtype) assert ser.any() assert ser.all() + assert ser.any(skipna=False) + assert ser.all(skipna=False) + + ser = Series([], dtype=any_string_dtype) + assert not ser.any() + assert ser.all() + assert not ser.any(skipna=False) + assert ser.all(skipna=False) + + ser = Series([""], dtype=any_string_dtype) + assert not ser.any() + assert not ser.all() + assert not ser.any(skipna=False) + assert not ser.all(skipna=False) + + ser = Series([np.nan], dtype=any_string_dtype) + assert not ser.any() + assert ser.all() + 
assert ser.any(skipna=False) # NaN is considered truthy + assert ser.all(skipna=False) # NaN is considered truthy def test_timedelta64_analytics(self): # index min/max From 67f9df4e6d515b1baf833cf6fe5a362e8c90bd3c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 14 Aug 2024 12:15:25 -0400 Subject: [PATCH 208/396] String dtype: implement object-dtype based StringArray variant with NumPy semantics (#58451) Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/_libs/lib.pyx | 2 +- pandas/_testing/asserters.py | 18 ++ pandas/compat/__init__.py | 2 + pandas/compat/pyarrow.py | 2 + pandas/conftest.py | 4 + pandas/core/arrays/string_.py | 183 +++++++++++++++++++-- pandas/core/construction.py | 4 +- pandas/core/dtypes/cast.py | 2 +- pandas/core/internals/construction.py | 2 +- pandas/io/_util.py | 4 +- pandas/io/pytables.py | 7 +- pandas/tests/arrays/string_/test_string.py | 22 ++- pandas/tests/series/test_constructors.py | 26 ++- pandas/tests/strings/test_find_replace.py | 2 +- 14 files changed, 232 insertions(+), 48 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1222b33aac3c1..5d8a04664b0e4 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2728,7 +2728,7 @@ def maybe_convert_objects(ndarray[object] objects, if using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow", na_value=np.nan) + dtype = StringDtype(na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index e79e353137152..a1f9844669c8c 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -811,6 +811,24 @@ def assert_extension_array_equal( left_na, right_na, obj=f"{obj} NA mask", index_values=index_values ) + # Specifically for StringArrayNumpySemantics, validate here we have a valid array + if ( + isinstance(left.dtype, StringDtype) + and left.dtype.storage == "python" + and left.dtype.na_value is np.nan + ): + assert np.all( + [np.isnan(val) for val in left._ndarray[left_na]] # type: ignore[attr-defined] + ), "wrong missing value sentinels" + if ( + isinstance(right.dtype, StringDtype) + and right.dtype.storage == "python" + and right.dtype.na_value is np.nan + ): + assert np.all( + [np.isnan(val) for val in right._ndarray[right_na]] # type: ignore[attr-defined] + ), "wrong missing value sentinels" + left_valid = left[~left_na].to_numpy(dtype=object) right_valid = right[~right_na].to_numpy(dtype=object) if check_exact: diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 5ada6d705172f..38fb0188df5ff 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -25,6 +25,7 @@ import pandas.compat.compressors from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( + HAS_PYARROW, pa_version_under10p1, pa_version_under11p0, pa_version_under13p0, @@ -190,6 +191,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under14p1", "pa_version_under16p0", "pa_version_under17p0", + "HAS_PYARROW", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 457d26766520d..7fa197c4a9824 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -17,6 +17,7 @@ pa_version_under15p0 = _palv < Version("15.0.0") pa_version_under16p0 = _palv < Version("16.0.0") pa_version_under17p0 = _palv < 
Version("17.0.0") + HAS_PYARROW = True except ImportError: pa_version_under10p1 = True pa_version_under11p0 = True @@ -27,3 +28,4 @@ pa_version_under15p0 = True pa_version_under16p0 = True pa_version_under17p0 = True + HAS_PYARROW = False diff --git a/pandas/conftest.py b/pandas/conftest.py index 78cdc2ac5a2bb..433ea7275223d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1265,6 +1265,7 @@ def string_storage(request): ("python", pd.NA), pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), ] ) def string_dtype_arguments(request): @@ -1326,12 +1327,14 @@ def object_dtype(request): ("python", pd.NA), pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), ], ids=[ "string=object", "string=string[python]", "string=string[pyarrow]", "string=str[pyarrow]", + "string=str[python]", ], ) def any_string_dtype(request): @@ -1341,6 +1344,7 @@ def any_string_dtype(request): * 'string[python]' (NA variant) * 'string[pyarrow]' (NA variant) * 'str' (NaN variant, with pyarrow) + * 'str' (NaN variant, without pyarrow) """ if isinstance(request.param, np.dtype): return request.param diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c4f208027c9da..6a3cd50b9c288 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,9 +1,12 @@ from __future__ import annotations +import operator from typing import ( TYPE_CHECKING, + Any, ClassVar, Literal, + cast, ) import numpy as np @@ -19,7 +22,10 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas._libs.lib import ensure_string_array -from pandas.compat import pa_version_under10p1 +from pandas.compat import ( + HAS_PYARROW, + pa_version_under10p1, +) from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -37,7 +43,10 @@ pandas_dtype, ) -from pandas.core import ops +from pandas.core import ( + nanops, + ops, +) from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import ( @@ -125,7 +134,10 @@ def __init__( # infer defaults if storage is None: if using_string_dtype() and na_value is not libmissing.NA: - storage = "pyarrow" + if HAS_PYARROW: + storage = "pyarrow" + else: + storage = "python" else: storage = get_option("mode.string_storage") @@ -243,10 +255,12 @@ def construct_array_type( # type: ignore[override] ArrowStringArrayNumpySemantics, ) - if self.storage == "python": + if self.storage == "python" and self._na_value is libmissing.NA: return StringArray elif self.storage == "pyarrow" and self._na_value is libmissing.NA: return ArrowStringArray + elif self.storage == "python": + return StringArrayNumpySemantics else: return ArrowStringArrayNumpySemantics @@ -282,7 +296,7 @@ def __from_arrow__( # convert chunk by chunk to numpy and concatenate then, to avoid # overflow for large string data when concatenating the pyarrow arrays arr = arr.to_numpy(zero_copy_only=False) - arr = ensure_string_array(arr, na_value=libmissing.NA) + arr = ensure_string_array(arr, na_value=self.na_value) results.append(arr) if len(chunks) == 0: @@ -292,11 +306,7 @@ def __from_arrow__( # Bypass validation inside StringArray constructor, see GH#47781 new_string_array = StringArray.__new__(StringArray) - NDArrayBacked.__init__( - new_string_array, - arr, - StringDtype(storage="python"), - ) + 
NDArrayBacked.__init__(new_string_array, arr, self) return new_string_array @@ -404,6 +414,8 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] # undo the NumpyExtensionArray hack _typ = "extension" + _storage = "python" + _na_value: libmissing.NAType | float = libmissing.NA def __init__(self, values, copy: bool = False) -> None: values = extract_array(values) @@ -411,7 +423,11 @@ def __init__(self, values, copy: bool = False) -> None: super().__init__(values, copy=copy) if not isinstance(values, type(self)): self._validate() - NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) + NDArrayBacked.__init__( + self, + self._ndarray, + StringDtype(storage=self._storage, na_value=self._na_value), + ) def _validate(self): """Validate that we only store NA or strings.""" @@ -429,20 +445,36 @@ def _validate(self): else: lib.convert_nans_to_NA(self._ndarray) + def _validate_scalar(self, value): + # used by NDArrayBackedExtensionIndex.insert + if isna(value): + return self.dtype.na_value + elif not isinstance(value, str): + raise TypeError( + f"Cannot set non-string value '{value}' into a string array." + ) + return value + @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) assert isinstance(dtype, StringDtype) and dtype.storage == "python" + else: + if using_string_dtype(): + dtype = StringDtype(storage="python", na_value=np.nan) + else: + dtype = StringDtype(storage="python") from pandas.core.arrays.masked import BaseMaskedArray + na_value = dtype.na_value if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype na_values = scalars._mask result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - result[na_values] = libmissing.NA + result[na_values] = na_value else: if lib.is_pyarrow_array(scalars): @@ -451,12 +483,12 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal # zero_copy_only to True which caused problems see GH#52076 scalars = np.array(scalars) # convert non-na-likes to str, and nan-likes to StringDtype().na_value - result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) + result = lib.ensure_string_array(scalars, na_value=na_value, copy=copy) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? new_string_array = cls.__new__(cls) - NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) + NDArrayBacked.__init__(new_string_array, result, dtype) return new_string_array @@ -506,7 +538,7 @@ def __setitem__(self, key, value) -> None: # validate new items if scalar_value: if isna(value): - value = libmissing.NA + value = self.dtype.na_value elif not isinstance(value, str): raise TypeError( f"Cannot set non-string value '{value}' into a StringArray." 
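For a sense of the runtime behavior these hunks enable, here is a minimal sketch
(assuming a pandas build that includes this patch; it uses only names the diff
itself introduces):

    import numpy as np
    import pandas as pd

    # NaN-variant string dtype: missing values follow NumPy semantics.
    dtype = pd.StringDtype(storage="python", na_value=np.nan)
    arr = pd.array(["a", None, "b"], dtype=dtype)

    # On construction, None is coerced to np.nan rather than pd.NA.
    assert np.isnan(arr[1])

    # Setting a non-string scalar still raises, as in StringArray.
    try:
        arr[0] = 10
    except TypeError:
        pass  # "Cannot set non-string value '10' into a StringArray."
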
@@ -520,7 +552,7 @@ def __setitem__(self, key, value) -> None: mask = isna(value) if mask.any(): value = value.copy() - value[isna(value)] = libmissing.NA + value[isna(value)] = self.dtype.na_value super().__setitem__(key, value) @@ -633,9 +665,9 @@ def _cmp_method(self, other, op): if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray, dtype="object") - result[mask] = libmissing.NA + result[mask] = self.dtype.na_value result[valid] = op(self._ndarray[valid], other) - return StringArray(result) + return self._from_backing_data(result) else: # logical result = np.zeros(len(self._ndarray), dtype="bool") @@ -704,3 +736,118 @@ def _str_map( # or .findall returns a list). # -> We don't know the result type. E.g. `.get` can return anything. return lib.map_infer_mask(arr, f, mask.view("uint8")) + + +class StringArrayNumpySemantics(StringArray): + _storage = "python" + _na_value = np.nan + + def _validate(self) -> None: + """Validate that we only store NaN or strings.""" + if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + raise ValueError( + "StringArrayNumpySemantics requires a sequence of strings or NaN" + ) + if self._ndarray.dtype != "object": + raise ValueError( + "StringArrayNumpySemantics requires a sequence of strings or NaN. Got " + f"'{self._ndarray.dtype}' dtype instead." + ) + # TODO validate or force NA/None to NaN + + @classmethod + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: + if dtype is None: + dtype = StringDtype(storage="python", na_value=np.nan) + return super()._from_sequence(scalars, dtype=dtype, copy=copy) + + def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: + # need to override NumpyExtensionArray._from_backing_data to ensure + # we always preserve the dtype + return NDArrayBacked._from_backing_data(self, arr) + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + if name in ["any", "all"]: + if name == "any": + return nanops.nanany(self._ndarray, skipna=skipna) + else: + return nanops.nanall(self._ndarray, skipna=skipna) + else: + return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: + # the masked_reductions use pd.NA + if result is libmissing.NA: + return np.nan + return super()._wrap_reduction_result(axis, result) + + def _cmp_method(self, other, op): + result = super()._cmp_method(other, op) + if op == operator.ne: + return result.to_numpy(np.bool_, na_value=True) + else: + return result.to_numpy(np.bool_, na_value=False) + + def value_counts(self, dropna: bool = True) -> Series: + from pandas.core.algorithms import value_counts_internal as value_counts + + result = value_counts(self._ndarray, sort=False, dropna=dropna) + result.index = result.index.astype(self.dtype) + return result + + # ------------------------------------------------------------------------ + # String methods interface + _str_na_value = np.nan + + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + convert = convert and not np.all(mask) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + na_value_is_na = isna(na_value) + if na_value_is_na: + if is_integer_dtype(dtype): + na_value = 0 + else: + na_value = True + + result = 
lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(cast(type, dtype)), + ) + if na_value_is_na and mask.any(): + if is_integer_dtype(dtype): + result = result.astype("float64") + else: + result = result.astype("object") + result[mask] = np.nan + return result + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + return type(self)(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. + return lib.map_infer_mask(arr, f, mask.view("uint8")) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 748b9e4947bec..5bccca9cfbd47 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -569,7 +569,7 @@ def sanitize_array( if isinstance(data, str) and using_string_dtype() and original_dtype is None: from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype("pyarrow", na_value=np.nan) + dtype = StringDtype(na_value=np.nan) data = construct_1d_arraylike_from_scalar(data, len(index), dtype) return data @@ -606,7 +606,7 @@ def sanitize_array( elif data.dtype.kind == "U" and using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow", na_value=np.nan) + dtype = StringDtype(na_value=np.nan) subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) if subarr is data and copy: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 5d4e56cd8f800..9580ab1b520e0 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -801,7 +801,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: if using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow", na_value=np.nan) + dtype = StringDtype(na_value=np.nan) elif isinstance(val, (np.datetime64, dt.datetime)): try: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 144416fc11691..f3dbacc02bec9 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -376,7 +376,7 @@ def ndarray_to_mgr( nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] elif dtype is None and values.dtype.kind == "U" and using_string_dtype(): - dtype = StringDtype(storage="pyarrow", na_value=np.nan) + dtype = StringDtype(na_value=np.nan) obj_columns = list(values) block_values = [ diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 2dc3b0e6c80ef..68fcfcf65e0c2 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -31,6 +31,6 @@ def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") return { - pa.string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), - pa.large_string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), + pa.string(): pd.StringDtype(na_value=np.nan), + pa.large_string(): pd.StringDtype(na_value=np.nan), }.get diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 12bb93a63f850..2e38303caa354 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -76,6 +76,7 @@ PeriodIndex, RangeIndex, Series, + StringDtype, TimedeltaIndex, concat, isna, @@ -3225,7 +3226,7 @@ def read( values = self.read_array("values", 
start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) if using_string_dtype() and is_string_array(values, skipna=True): - result = result.astype("string[pyarrow_numpy]") + result = result.astype(StringDtype(na_value=np.nan)) return result def write(self, obj, **kwargs) -> None: @@ -3294,7 +3295,7 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) if using_string_dtype() and is_string_array(values, skipna=True): - df = df.astype("string[pyarrow_numpy]") + df = df.astype(StringDtype(na_value=np.nan)) dfs.append(df) if len(dfs) > 0: @@ -4685,7 +4686,7 @@ def read( values, # type: ignore[arg-type] skipna=True, ): - df = df.astype("string[pyarrow_numpy]") + df = df.astype(StringDtype(na_value=np.nan)) frames.append(df) if len(frames) == 1: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 6bf20b6fcd5f7..b51b01c2b5168 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -15,6 +15,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays.string_ import StringArrayNumpySemantics from pandas.core.arrays.string_arrow import ( ArrowStringArray, ArrowStringArrayNumpySemantics, @@ -75,6 +76,9 @@ def test_repr(dtype): elif dtype.storage == "pyarrow" and dtype.na_value is np.nan: arr_name = "ArrowStringArrayNumpySemantics" expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + elif dtype.storage == "python" and dtype.na_value is np.nan: + arr_name = "StringArrayNumpySemantics" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" else: arr_name = "StringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" @@ -90,14 +94,14 @@ def test_none_to_nan(cls, dtype): def test_setitem_validates(cls, dtype): arr = cls._from_sequence(["a", "b"], dtype=dtype) - if cls is pd.arrays.StringArray: + if dtype.storage == "python": msg = "Cannot set non-string value '10' into a StringArray." else: msg = "Scalar must be NA or str" with pytest.raises(TypeError, match=msg): arr[0] = 10 - if cls is pd.arrays.StringArray: + if dtype.storage == "python": msg = "Must provide strings." 
else: msg = "Scalar must be NA or str" @@ -339,6 +343,8 @@ def test_comparison_methods_array(comparison_op, dtype): def test_constructor_raises(cls): if cls is pd.arrays.StringArray: msg = "StringArray requires a sequence of strings or pandas.NA" + elif cls is StringArrayNumpySemantics: + msg = "StringArrayNumpySemantics requires a sequence of strings or NaN" else: msg = "Unsupported type '' for ArrowExtensionArray" @@ -348,7 +354,7 @@ def test_constructor_raises(cls): with pytest.raises(ValueError, match=msg): cls(np.array([])) - if cls is pd.arrays.StringArray: + if cls is pd.arrays.StringArray or cls is StringArrayNumpySemantics: # GH#45057 np.nan and None do NOT raise, as they are considered valid NAs # for string dtype cls(np.array(["a", np.nan], dtype=object)) @@ -389,6 +395,8 @@ def test_from_sequence_no_mutate(copy, cls, dtype): import pyarrow as pa expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) + elif cls is StringArrayNumpySemantics: + expected = cls(nan_arr) else: expected = cls(na_arr) @@ -671,7 +679,11 @@ def test_isin(dtype, fixed_now_ts): tm.assert_series_equal(result, expected) result = s.isin(["a", pd.NA]) - expected = pd.Series([True, False, True]) + if dtype.storage == "python" and dtype.na_value is np.nan: + # TODO(infer_string) we should make this consistent + expected = pd.Series([True, False, False]) + else: + expected = pd.Series([True, False, True]) tm.assert_series_equal(result, expected) result = s.isin([]) @@ -695,7 +707,7 @@ def test_setitem_scalar_with_mask_validation(dtype): # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) - if type(ser.array) is pd.arrays.StringArray: + if dtype.storage == "python": msg = "Cannot set non-string value" else: msg = "Scalar must be NA or str" diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 0c39cead78baf..0aaa8ddcfda0c 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -14,6 +14,7 @@ iNaT, lib, ) +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -2094,11 +2095,10 @@ def test_series_from_index_dtype_equal_does_not_copy(self): def test_series_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Series(["a", "b"], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", "b"]) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(ser, expected) expected = Series(["a", 1], dtype="object") @@ -2109,35 +2109,33 @@ def test_series_string_inference(self): @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA]) def test_series_string_with_na_inference(self, na_value): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Series(["a", na_value], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", na_value]) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series(["a", None], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_scalar(self): # GH#54430 - pytest.importorskip("pyarrow") - expected = Series("a", index=[1], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = 
Series("a", index=[1]) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series("a", index=[1], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_array_string_dtype(self): # GH#54496 - pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series(np.array(["a", "b"])) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_storage_definition(self): # https://github.com/pandas-dev/pandas/issues/54793 # but after PDEP-14 (string dtype), it was decided to keep dtype="string" # returning the NA string dtype, so expected is changed from - # "string[pyarrow_numpy]" to "string[pyarrow]" - pytest.importorskip("pyarrow") + # "string[pyarrow_numpy]" to "string[python]" expected = Series(["a", "b"], dtype="string[python]") with pd.option_context("future.infer_string", True): result = Series(["a", "b"], dtype="string") @@ -2153,10 +2151,10 @@ def test_series_constructor_infer_string_scalar(self): def test_series_string_inference_na_first(self): # GH#55655 - pytest.importorskip("pyarrow") - expected = Series([pd.NA, "b"], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): result = Series([pd.NA, "b"]) + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) + expected = Series([None, "b"], dtype=dtype) tm.assert_series_equal(result, expected) def test_inference_on_pandas_objects(self): diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index df490297e2a5c..2d7c9754ee319 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -236,7 +236,7 @@ def test_contains_nan(any_string_dtype): result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) - elif any_string_dtype == "string[pyarrow_numpy]": + elif any_string_dtype.na_value is np.nan: expected = Series([True, True, True], dtype=np.bool_) else: expected = Series([True, True, True], dtype="boolean") From d4b669eee0d941e7983934007a78a9bd71e81278 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 14 Aug 2024 12:16:06 -0400 Subject: [PATCH 209/396] REF (string dtype): de-duplicate _str_map methods (#59443) * REF: de-duplicate _str_map methods * mypy fixup --- pandas/core/arrays/string_.py | 138 ++++++++++++++++------------- pandas/core/arrays/string_arrow.py | 117 ++++++++++-------------- 2 files changed, 124 insertions(+), 131 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 6a3cd50b9c288..3cbacec9d411d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -315,6 +315,8 @@ class BaseStringArray(ExtensionArray): Mixin class for StringArray, ArrowStringArray. 
""" + dtype: StringDtype + @doc(ExtensionArray.tolist) def tolist(self): if self.ndim > 1: @@ -328,6 +330,37 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: raise ValueError return cls._from_sequence(scalars, dtype=dtype) + def _str_map_str_or_object( + self, + dtype, + na_value, + arr: np.ndarray, + f, + mask: npt.NDArray[np.bool_], + convert: bool, + ): + # _str_map helper for case where dtype is either string dtype or object + if is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + if self.dtype.storage == "pyarrow": + import pyarrow as pa + + result = pa.array( + result, mask=mask, type=pa.large_string(), from_pandas=True + ) + # error: Too many arguments for "BaseStringArray" + return type(self)(result) # type: ignore[call-arg] + + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. + return lib.map_infer_mask(arr, f, mask.view("uint8")) + # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" @@ -682,9 +715,53 @@ def _cmp_method(self, other, op): # base class "NumpyExtensionArray" defined the type as "float") _str_na_value = libmissing.NA # type: ignore[assignment] + def _str_map_nan_semantics( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + convert = convert and not np.all(mask) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + na_value_is_na = isna(na_value) + if na_value_is_na: + if is_integer_dtype(dtype): + na_value = 0 + else: + na_value = True + + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(cast(type, dtype)), + ) + if na_value_is_na and mask.any(): + if is_integer_dtype(dtype): + result = result.astype("float64") + else: + result = result.astype("object") + result[mask] = np.nan + return result + + else: + return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) + def _str_map( self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True ): + if self.dtype.na_value is np.nan: + return self._str_map_nan_semantics( + f, na_value=na_value, dtype=dtype, convert=convert + ) + from pandas.arrays import BooleanArray if dtype is None: @@ -724,18 +801,8 @@ def _str_map( return constructor(result, mask) - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - return StringArray(result) else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. 
- return lib.map_infer_mask(arr, f, mask.view("uint8")) + return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) class StringArrayNumpySemantics(StringArray): @@ -802,52 +869,3 @@ def value_counts(self, dropna: bool = True) -> Series: # ------------------------------------------------------------------------ # String methods interface _str_na_value = np.nan - - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - convert = convert and not np.all(mask) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - na_value_is_na = isna(na_value) - if na_value_is_na: - if is_integer_dtype(dtype): - na_value = 0 - else: - na_value = True - - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(cast(type, dtype)), - ) - if na_value_is_na and mask.any(): - if is_integer_dtype(dtype): - result = result.astype("float64") - else: - result = result.astype("object") - result[mask] = np.nan - return result - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - return type(self)(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, f, mask.view("uint8")) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 94f6f9064885e..607f6f7e4246a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -7,6 +7,7 @@ TYPE_CHECKING, Callable, Union, + cast, ) import warnings @@ -25,9 +26,7 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_integer_dtype, - is_object_dtype, is_scalar, - is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.missing import isna @@ -284,9 +283,53 @@ def _data(self): # base class "ObjectStringArrayMixin" defined the type as "float") _str_na_value = libmissing.NA # type: ignore[assignment] + def _str_map_nan_semantics( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + if is_integer_dtype(dtype): + na_value = np.nan + else: + na_value = False + + dtype = np.dtype(cast(type, dtype)) + if mask.any(): + # numpy int/bool dtypes cannot hold NaNs so we must convert to + # float64 for int (to match maybe_convert_objects) or + # object for bool (again to match maybe_convert_objects) + if is_integer_dtype(dtype): + dtype = np.dtype("float64") + else: + dtype = np.dtype(object) + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=dtype, + ) + return result + + else: + return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) + def _str_map( self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True ): + if self.dtype.na_value is np.nan: + return self._str_map_nan_semantics( + f, na_value=na_value, dtype=dtype, convert=convert + ) + # TODO: de-duplicate with StringArray method. 
This method is moreless copy and # paste. @@ -330,21 +373,8 @@ def _str_map( return constructor(result, mask) - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - result = pa.array( - result, mask=mask, type=pa.large_string(), from_pandas=True - ) - return type(self)(result) else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, f, mask.view("uint8")) + return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -615,61 +645,6 @@ def __getattribute__(self, item): return partial(getattr(ArrowStringArrayMixin, item), self) return super().__getattribute__(item) - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - if is_integer_dtype(dtype): - na_value = np.nan - else: - na_value = False - try: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(dtype), # type: ignore[arg-type] - ) - return result - - except ValueError: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - ) - if convert and result.dtype == object: - result = lib.maybe_convert_objects(result) - return result - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - result = pa.array( - result, mask=mask, type=pa.large_string(), from_pandas=True - ) - return type(self)(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. 
- return lib.map_infer_mask(arr, f, mask.view("uint8")) - def _convert_int_dtype(self, result): if isinstance(result, pa.Array): result = result.to_numpy(zero_copy_only=False) From b55191d1f397bbd39806214d0c76bcea38ffbf36 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 20 Sep 2024 13:54:03 -0400 Subject: [PATCH 210/396] String dtype: use 'str' string alias and representation for NaN-variant of the dtype (#59388) --- pandas/_testing/__init__.py | 6 +++- pandas/core/arrays/arrow/array.py | 5 ++- pandas/core/arrays/string_.py | 24 +++++++++---- pandas/core/frame.py | 4 ++- pandas/core/interchange/utils.py | 7 +++- pandas/tests/apply/test_numba.py | 2 +- pandas/tests/apply/test_series_apply.py | 2 +- pandas/tests/arrays/boolean/test_astype.py | 12 +++++-- .../tests/arrays/categorical/test_astype.py | 2 +- pandas/tests/arrays/categorical/test_repr.py | 2 +- pandas/tests/arrays/floating/test_astype.py | 17 ++++++--- pandas/tests/arrays/integer/test_dtypes.py | 17 ++++++--- .../arrays/interval/test_interval_pyarrow.py | 3 -- .../tests/arrays/period/test_arrow_compat.py | 4 --- pandas/tests/arrays/string_/test_string.py | 35 +++++++++++-------- .../tests/arrays/string_/test_string_arrow.py | 6 ++-- pandas/tests/arrays/test_datetimelike.py | 7 ++-- pandas/tests/dtypes/test_common.py | 19 ++++++++++ pandas/tests/dtypes/test_dtypes.py | 2 +- pandas/tests/extension/test_string.py | 14 ++++++++ pandas/tests/frame/indexing/test_indexing.py | 8 ++--- pandas/tests/frame/indexing/test_set_value.py | 2 +- pandas/tests/frame/methods/test_astype.py | 6 ++-- .../frame/methods/test_get_numeric_data.py | 4 ++- pandas/tests/frame/methods/test_nlargest.py | 2 +- .../tests/frame/methods/test_reset_index.py | 4 +-- .../tests/frame/methods/test_select_dtypes.py | 19 ++++++++-- pandas/tests/frame/methods/test_to_csv.py | 5 +-- pandas/tests/frame/test_block_internals.py | 4 ++- pandas/tests/frame/test_constructors.py | 31 +++++++++++----- pandas/tests/frame/test_stack_unstack.py | 6 +++- pandas/tests/generic/test_to_xarray.py | 4 +-- pandas/tests/groupby/test_apply.py | 17 +++------ pandas/tests/groupby/test_categorical.py | 2 +- pandas/tests/groupby/test_groupby.py | 7 ++-- pandas/tests/groupby/test_numeric_only.py | 2 ++ .../tests/indexes/base_class/test_formats.py | 1 - .../tests/indexes/multi/test_constructors.py | 2 +- pandas/tests/indexes/multi/test_get_set.py | 6 ++-- pandas/tests/indexes/object/test_indexing.py | 2 ++ pandas/tests/indexes/test_base.py | 4 +-- pandas/tests/indexing/multiindex/test_loc.py | 2 +- pandas/tests/indexing/test_indexing.py | 2 +- pandas/tests/indexing/test_loc.py | 15 +++++--- pandas/tests/indexing/test_partial.py | 2 +- pandas/tests/internals/test_internals.py | 2 +- pandas/tests/io/excel/test_writers.py | 2 +- .../tests/io/json/test_json_table_schema.py | 6 ++-- pandas/tests/io/json/test_pandas.py | 7 ++-- pandas/tests/io/pytables/test_keys.py | 7 +++- pandas/tests/io/pytables/test_subclass.py | 3 ++ pandas/tests/io/test_common.py | 1 + pandas/tests/io/test_fsspec.py | 2 +- pandas/tests/io/test_gcs.py | 1 + pandas/tests/io/xml/test_xml_dtypes.py | 4 --- .../tests/reshape/concat/test_categorical.py | 4 +-- pandas/tests/reshape/concat/test_empty.py | 4 +-- pandas/tests/reshape/concat/test_index.py | 6 ++-- pandas/tests/reshape/merge/test_join.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 10 +++--- pandas/tests/reshape/merge/test_merge_asof.py | 4 +-- pandas/tests/reshape/test_from_dummies.py | 2 +- pandas/tests/reshape/test_get_dummies.py | 2 +- 
.../series/accessors/test_dt_accessor.py | 2 +- pandas/tests/series/indexing/test_delitem.py | 11 +++--- pandas/tests/series/indexing/test_getitem.py | 4 +-- pandas/tests/series/indexing/test_setitem.py | 2 +- pandas/tests/series/methods/test_astype.py | 4 +-- pandas/tests/series/methods/test_map.py | 14 +++----- pandas/tests/series/methods/test_rename.py | 2 +- .../tests/series/methods/test_reset_index.py | 2 +- pandas/tests/series/methods/test_to_csv.py | 7 ++-- pandas/tests/series/test_constructors.py | 11 ++++-- pandas/tests/series/test_formats.py | 4 +-- pandas/tests/test_downstream.py | 3 -- pandas/tests/util/test_assert_frame_equal.py | 4 +-- pandas/tests/util/test_assert_index_equal.py | 2 +- pandas/tests/util/test_assert_series_equal.py | 6 ++-- pandas/tests/window/test_api.py | 2 +- 79 files changed, 306 insertions(+), 192 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 994b351acf42c..10c1c490551fb 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -14,6 +14,7 @@ import numpy as np +from pandas._config import using_string_dtype from pandas._config.localization import ( can_set_locale, get_locales, @@ -110,7 +111,10 @@ ALL_FLOAT_DTYPES: list[Dtype] = [*FLOAT_NUMPY_DTYPES, *FLOAT_EA_DTYPES] COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] -STRING_DTYPES: list[Dtype] = [str, "str", "U"] +if using_string_dtype(): + STRING_DTYPES: list[Dtype] = [str, "U"] +else: + STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef] COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] DATETIME64_DTYPES: list[Dtype] = ["datetime64[ns]", "M8[ns]"] diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a156042ac0c0e..6c44b7759f0e2 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -570,7 +570,10 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + if ( + isinstance(self._dtype, StringDtype) + and self._dtype.storage == "pyarrow" + ): # TODO(infer_string) should this be large_string? pa_dtype = pa.string() else: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3cbacec9d411d..0929791ded58c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -4,7 +4,6 @@ from typing import ( TYPE_CHECKING, Any, - ClassVar, Literal, cast, ) @@ -114,9 +113,12 @@ class StringDtype(StorageExtensionDtype): string[pyarrow] """ - # error: Cannot override instance variable (previously declared on - # base class "StorageExtensionDtype") with class variable - name: ClassVar[str] = "string" # type: ignore[misc] + @property + def name(self) -> str: # type: ignore[override] + if self._na_value is libmissing.NA: + return "string" + else: + return "str" #: StringDtype().na_value uses pandas.NA except the implementation that # follows NumPy semantics, which uses nan. 
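As a quick illustration of the new alias and name, a sketch assuming this patch
is applied (and, at this point in the series, that pyarrow is installed, since
the "str" alias is not yet wired up for python storage everywhere):

    import numpy as np
    import pandas as pd

    # The NaN variant now reports "str"; the NA variant keeps "string".
    assert pd.StringDtype(na_value=np.nan).name == "str"
    assert pd.StringDtype().name == "string"

    # With the future option enabled, the "str" alias resolves to the
    # NaN variant via StringDtype.construct_from_string.
    with pd.option_context("future.infer_string", True):
        ser = pd.Series(["a", None], dtype="str")
        assert isinstance(ser.dtype, pd.StringDtype)
        assert ser.dtype.na_value is np.nan
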
@@ -133,7 +135,7 @@ def __init__( ) -> None: # infer defaults if storage is None: - if using_string_dtype() and na_value is not libmissing.NA: + if na_value is not libmissing.NA: if HAS_PYARROW: storage = "pyarrow" else: @@ -166,11 +168,19 @@ def __init__( self.storage = storage self._na_value = na_value + def __repr__(self) -> str: + if self._na_value is libmissing.NA: + return f"{self.name}[{self.storage}]" + else: + # TODO add more informative repr + return self.name + def __eq__(self, other: object) -> bool: # we need to override the base class __eq__ because na_value (NA or NaN) # cannot be checked with normal `==` if isinstance(other, str): - if other == self.name: + # TODO should dtype == "string" work for the NaN variant? + if other == "string" or other == self.name: # noqa: PLR1714 return True try: other = self.construct_from_string(other) @@ -227,6 +237,8 @@ def construct_from_string(cls, string) -> Self: ) if string == "string": return cls() + elif string == "str" and using_string_dtype(): + return cls(na_value=np.nan) elif string == "string[python]": return cls(storage="python") elif string == "string[pyarrow]": diff --git a/pandas/core/frame.py b/pandas/core/frame.py index afcd4d014316e..1403fc2ceaaf8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4979,7 +4979,9 @@ def select_dtypes(self, include=None, exclude=None) -> Self: ----- * To select all *numeric* types, use ``np.number`` or ``'number'`` * To select strings you must use the ``object`` dtype, but note that - this will return *all* object dtype columns + this will return *all* object dtype columns. With + ``pd.options.future.infer_string`` enabled, using ``"str"`` will + work to select all string columns. * See the `numpy dtype hierarchy `__ * To select datetimes, use ``np.datetime64``, ``'datetime'`` or diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py index fd1c7c9639242..035a1f8abdbc5 100644 --- a/pandas/core/interchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -135,7 +135,12 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: if format_str is not None: return format_str - if lib.is_np_dtype(dtype, "M"): + if isinstance(dtype, pd.StringDtype): + # TODO(infer_string) this should be LARGE_STRING for pyarrow storage, + # but current tests don't cover this distinction + return ArrowCTypes.STRING + + elif lib.is_np_dtype(dtype, "M"): # Selecting the first char of resolution string: # dtype.str -> ' 'n' resolution = np.datetime_data(dtype)[0][0] diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index aee9100702350..6ac0b49f0e4e7 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -110,7 +110,7 @@ def test_numba_unsupported_dtypes(apply_axis): with pytest.raises( ValueError, - match="Column b must have a numeric dtype. Found 'object|string' instead", + match="Column b must have a numeric dtype. 
Found 'object|str' instead", ): df.apply(f, engine="numba", axis=apply_axis) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index df24fa08f48e1..69f84ca74ab0b 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -244,7 +244,7 @@ def test_apply_categorical(by_row, using_infer_string): result = ser.apply(lambda x: "A") exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object if not using_infer_string else "string[pyarrow_numpy]" + assert result.dtype == object if not using_infer_string else "str" @pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]]) diff --git a/pandas/tests/arrays/boolean/test_astype.py b/pandas/tests/arrays/boolean/test_astype.py index 932e903c0e448..8c2672218f273 100644 --- a/pandas/tests/arrays/boolean/test_astype.py +++ b/pandas/tests/arrays/boolean/test_astype.py @@ -5,7 +5,7 @@ import pandas._testing as tm -def test_astype(): +def test_astype(using_infer_string): # with missing values arr = pd.array([True, False, None], dtype="boolean") @@ -20,8 +20,14 @@ def test_astype(): tm.assert_numpy_array_equal(result, expected) result = arr.astype("str") - expected = np.array(["True", "False", ""], dtype=f"{tm.ENDIAN}U5") - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array( + ["True", "False", None], dtype=pd.StringDtype(na_value=np.nan) + ) + tm.assert_extension_array_equal(result, expected) + else: + expected = np.array(["True", "False", ""], dtype=f"{tm.ENDIAN}U5") + tm.assert_numpy_array_equal(result, expected) # no missing values arr = pd.array([True, False, True], dtype="boolean") diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index a2a53af6ab1ad..ee930ac84aaf2 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -89,7 +89,7 @@ def test_astype(self, ordered): expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) - msg = r"Cannot cast object|string dtype to float64" + msg = r"Cannot cast object|str dtype to float64" with pytest.raises(ValueError, match=msg): cat.astype(float) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index e2e5d47f50209..3a2c489920eb0 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -22,7 +22,7 @@ def test_print(self, using_infer_string): if using_infer_string: expected = [ "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", - "Categories (3, string): [a < b < c]", + "Categories (3, str): [a < b < c]", ] else: expected = [ diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index ade3dbd2c99da..ccf644b34051d 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -63,12 +63,21 @@ def test_astype_to_integer_array(): tm.assert_extension_array_equal(result, expected) -def test_astype_str(): +def test_astype_str(using_infer_string): a = pd.array([0.1, 0.2, None], dtype="Float64") - expected = np.array(["0.1", "0.2", ""], dtype="U32") - tm.assert_numpy_array_equal(a.astype(str), expected) - tm.assert_numpy_array_equal(a.astype("str"), expected) + if using_infer_string: + expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan)) + 
tm.assert_extension_array_equal(a.astype("str"), expected) + + # TODO(infer_string) this should also be a string array like above + expected = np.array(["0.1", "0.2", ""], dtype="U32") + tm.assert_numpy_array_equal(a.astype(str), expected) + else: + expected = np.array(["0.1", "0.2", ""], dtype="U32") + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) def test_astype_copy(): diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index 8620763988e06..7be00e569b3fe 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -278,12 +278,21 @@ def test_to_numpy_na_raises(dtype): a.to_numpy(dtype=dtype) -def test_astype_str(): +def test_astype_str(using_infer_string): a = pd.array([1, 2, None], dtype="Int64") - expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") - tm.assert_numpy_array_equal(a.astype(str), expected) - tm.assert_numpy_array_equal(a.astype("str"), expected) + if using_infer_string: + expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_extension_array_equal(a.astype("str"), expected) + + # TODO(infer_string) this should also be a string array like above + expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") + tm.assert_numpy_array_equal(a.astype(str), expected) + else: + expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) def test_astype_boolean(): diff --git a/pandas/tests/arrays/interval/test_interval_pyarrow.py b/pandas/tests/arrays/interval/test_interval_pyarrow.py index be87d5d3ef7ba..ef8701be81e2b 100644 --- a/pandas/tests/arrays/interval/test_interval_pyarrow.py +++ b/pandas/tests/arrays/interval/test_interval_pyarrow.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -82,7 +80,6 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index ff86b696c8403..431309aca0df2 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -1,7 +1,5 @@ import pytest -from pandas._config import using_string_dtype - from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.dtypes import PeriodDtype @@ -79,7 +77,6 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_arrow_table_roundtrip(): from pandas.core.arrays.arrow.extension_types import ArrowPeriodType @@ -99,7 +96,6 @@ def test_arrow_table_roundtrip(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_arrow_load_from_zero_chunks(): # GH-41040 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b51b01c2b5168..1296cc3b5a494 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -65,7 +65,7 @@ def test_repr(dtype): assert 
repr(df) == expected if dtype.na_value is np.nan: - expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" + expected = "0 a\n1 NaN\n2 b\nName: A, dtype: str" else: expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected @@ -75,10 +75,10 @@ def test_repr(dtype): expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" elif dtype.storage == "pyarrow" and dtype.na_value is np.nan: arr_name = "ArrowStringArrayNumpySemantics" - expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str" elif dtype.storage == "python" and dtype.na_value is np.nan: arr_name = "StringArrayNumpySemantics" - expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str" else: arr_name = "StringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" @@ -502,7 +502,7 @@ def test_fillna_args(dtype): tm.assert_extension_array_equal(res, expected) if dtype.storage == "pyarrow": - msg = "Invalid value '1' for dtype string" + msg = "Invalid value '1' for dtype str" else: msg = "Cannot set non-string value '1' into a StringArray." with pytest.raises(TypeError, match=msg): @@ -524,7 +524,7 @@ def test_arrow_array(dtype): assert arr.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_roundtrip(dtype, string_storage, using_infer_string): # roundtrip possible from arrow 1.0.0 @@ -539,14 +539,17 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string): assert table.field("a").type == "large_string" with pd.option_context("string_storage", string_storage): result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage}]") - tm.assert_frame_equal(result, expected) - # ensure the missing value is represented by NA and not np.nan or None - assert result.loc[2, "a"] is result["a"].dtype.na_value + if dtype.na_value is np.nan and not using_string_dtype(): + assert result["a"].dtype == "object" + else: + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(f"string[{string_storage}]") + tm.assert_frame_equal(result, expected) + # ensure the missing value is represented by NA and not np.nan or None + assert result.loc[2, "a"] is result["a"].dtype.na_value -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 @@ -563,9 +566,13 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) with pd.option_context("string_storage", string_storage): result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage}]") - tm.assert_frame_equal(result, expected) + + if dtype.na_value is np.nan and not using_string_dtype(): + assert result["a"].dtype == "object" + else: + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(f"string[{string_storage}]") + 
tm.assert_frame_equal(result, expected) def test_value_counts_na(dtype): diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index d38b728aaf120..8d5c16e448cee 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -27,8 +28,9 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage == "python": - # python string storage with na_value=NaN is not yet implemented + if using_infer_string and string_storage == "python" and HAS_PYARROW: + # string storage with na_value=NaN always uses pyarrow if available + # -> does not yet honor the option request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) with pd.option_context("string_storage", string_storage): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 4961123a7ca07..360ab960088ed 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -295,7 +295,9 @@ def test_searchsorted(self): assert result == 10 @pytest.mark.parametrize("box", [None, "index", "series"]) - def test_searchsorted_castable_strings(self, arr1d, box, string_storage): + def test_searchsorted_castable_strings( + self, arr1d, box, string_storage, using_infer_string + ): arr = arr1d if box is None: pass @@ -331,7 +333,8 @@ def test_searchsorted_castable_strings(self, arr1d, box, string_storage): TypeError, match=re.escape( f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " - "or array of those. Got string array instead." + "or array of those. Got " + f"{'str' if using_infer_string else 'string'} array instead." 
), ): arr.searchsorted([str(arr[1]), "baz"]) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index c34c97b6e4f04..e0232bb292d6e 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -799,3 +799,22 @@ def test_pandas_dtype_ea_not_instance(): # GH 31356 GH 54592 with tm.assert_produces_warning(UserWarning): assert pandas_dtype(CategoricalDtype) == CategoricalDtype() + + +def test_pandas_dtype_string_dtypes(string_storage): + # TODO(infer_string) remove skip if "python" is supported + pytest.importorskip("pyarrow") + with pd.option_context("future.infer_string", True): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("str") + # TODO(infer_string) hardcoded to pyarrow until python is supported + assert result == pd.StringDtype("pyarrow", na_value=np.nan) + + with pd.option_context("future.infer_string", False): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("str") + assert result == np.dtype("U") + + with pd.option_context("string_storage", string_storage): + result = pandas_dtype("string") + assert result == pd.StringDtype(string_storage, na_value=pd.NA) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index e522d2666a2dc..a4916ed1bbd8a 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1062,7 +1062,7 @@ def test_str_vs_repr(self, ordered, using_infer_string): c1 = CategoricalDtype(["a", "b"], ordered=ordered) assert str(c1) == "category" # Py2 will have unicode prefixes - dtype = "string" if using_infer_string else "object" + dtype = "str" if using_infer_string else "object" pat = ( r"CategoricalDtype\(categories=\[.*\], ordered={ordered}, " rf"categories_dtype={dtype}\)" diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 1102d9d941663..f800f734ec9d9 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -116,6 +116,20 @@ def test_is_not_string_type(self, dtype): # because StringDtype is a string type assert is_string_dtype(dtype) + def test_is_dtype_from_name(self, dtype, using_infer_string): + if dtype.na_value is np.nan and not using_infer_string: + result = type(dtype).is_dtype(dtype.name) + assert result is False + else: + super().test_is_dtype_from_name(dtype) + + def test_construct_from_string_own_name(self, dtype, using_infer_string): + if dtype.na_value is np.nan and not using_infer_string: + with pytest.raises(TypeError, match="Cannot construct a 'StringDtype'"): + dtype.construct_from_string(dtype.name) + else: + super().test_construct_from_string_own_name(dtype) + def test_view(self, data): if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 09f359df37dd1..ec00044b84c49 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -337,7 +337,7 @@ def test_setitem( smaller["col10"] = ["1", "2"] if using_infer_string: - assert smaller["col10"].dtype == "string" + assert smaller["col10"].dtype == "str" else: assert smaller["col10"].dtype == np.object_ assert (smaller["col10"] == ["1", "2"]).all() @@ -472,13 +472,13 @@ def test_setitem_corner(self, float_frame, using_infer_string): del dm["foo"] dm["foo"] = "bar" if using_infer_string: - assert dm["foo"].dtype == 
"string" + assert dm["foo"].dtype == "str" else: assert dm["foo"].dtype == np.object_ dm["coercible"] = ["1", "2", "3"] if using_infer_string: - assert dm["coercible"].dtype == "string" + assert dm["coercible"].dtype == "str" else: assert dm["coercible"].dtype == np.object_ @@ -514,7 +514,7 @@ def test_setitem_ambig(self, using_infer_string): dm[2] = uncoercable_series assert len(dm.columns) == 3 if using_infer_string: - assert dm[2].dtype == "string" + assert dm[2].dtype == "str" else: assert dm[2].dtype == np.object_ diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py index ce771280bc264..3d23e13264911 100644 --- a/pandas/tests/frame/indexing/test_set_value.py +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -28,7 +28,7 @@ def test_set_value_resize(self, float_frame, using_infer_string): res = float_frame.copy() res._set_value("foobar", "baz", "sam") if using_infer_string: - assert res["baz"].dtype == "string" + assert res["baz"].dtype == "str" else: assert res["baz"].dtype == np.object_ res = float_frame.copy() diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index ea9cc22d93758..9c27e76de91b2 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -202,7 +202,7 @@ def test_astype_dict_like(self, dtype_class): expected = DataFrame( { "a": a, - "b": Series(["0", "1", "2", "3", "4"], dtype="object"), + "b": Series(["0", "1", "2", "3", "4"], dtype="str"), "c": c, "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), } @@ -263,9 +263,9 @@ def test_astype_duplicate_col(self): a2 = Series([0, 1, 2, 3, 4], name="a") df = concat([a1, b, a2], axis=1) - result = df.astype(str) + result = df.astype("str") a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a") - b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b") + b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype="str", name="b") a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a") expected = concat([a1_str, b_str, a2_str], axis=1) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_get_numeric_data.py b/pandas/tests/frame/methods/test_get_numeric_data.py index c5d32d56d03c1..6d097e75f6703 100644 --- a/pandas/tests/frame/methods/test_get_numeric_data.py +++ b/pandas/tests/frame/methods/test_get_numeric_data.py @@ -33,7 +33,9 @@ def test_get_numeric_data(self, using_infer_string): [ np.dtype("float64"), np.dtype("int64"), - np.dtype(objectname) if not using_infer_string else "string", + np.dtype(objectname) + if not using_infer_string + else pd.StringDtype(na_value=np.nan), np.dtype(datetime64name), ], index=["a", "b", "c", "f"], diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 3ba893501914a..54f2e45488b78 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -86,7 +86,7 @@ def test_nlargest_n(self, df_strings, nselect_method, n, order): df = df_strings if "b" in order: error_msg = ( - f"Column 'b' has dtype (object|string), " + f"Column 'b' has dtype (object|str), " f"cannot use method '{nselect_method}' with this dtype" ) with pytest.raises(TypeError, match=error_msg): diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 44d7bbf57fe0a..8d93c97b6b68a 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ 
b/pandas/tests/frame/methods/test_reset_index.py @@ -664,7 +664,7 @@ def test_reset_index_dtypes_on_empty_frame_with_multiindex( idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = DataFrame(index=idx)[:0].reset_index().dtypes if using_infer_string and dtype == object: - dtype = "string" + dtype = pd.StringDtype(na_value=np.nan) expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) tm.assert_series_equal(result, expected) @@ -697,7 +697,7 @@ def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby( expected["c3"] = expected["c3"].astype("datetime64[ns]") expected["c1"] = expected["c1"].astype("float64") if using_infer_string: - expected["c2"] = expected["c2"].astype("string[pyarrow_numpy]") + expected["c2"] = expected["c2"].astype("str") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index d1bee6a3de613..875dca321635f 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -50,7 +50,7 @@ def copy(self): class TestSelectDtypes: - def test_select_dtypes_include_using_list_like(self): + def test_select_dtypes_include_using_list_like(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -94,6 +94,11 @@ def test_select_dtypes_include_using_list_like(self): with pytest.raises(NotImplementedError, match=r"^$"): df.select_dtypes(include=["period"]) + if using_infer_string: + ri = df.select_dtypes(include=["str"]) + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_list_like(self): df = DataFrame( { @@ -151,7 +156,7 @@ def test_select_dtypes_exclude_include_int(self, include): expected = df[["b", "c", "e"]] tm.assert_frame_equal(result, expected) - def test_select_dtypes_include_using_scalars(self): + def test_select_dtypes_include_using_scalars(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -187,6 +192,11 @@ def test_select_dtypes_include_using_scalars(self): with pytest.raises(NotImplementedError, match=r"^$"): df.select_dtypes(include="period") + if using_infer_string: + ri = df.select_dtypes(include="str") + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_scalars(self): df = DataFrame( { @@ -347,7 +357,10 @@ def test_select_dtypes_datetime_with_tz(self): @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) - def test_select_dtypes_str_raises(self, dtype, arg): + def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string): + if using_infer_string and dtype == "str": + # this is tested below + pytest.skip("Selecting string columns works with future strings") df = DataFrame( { "a": list("abc"), diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index bed8b030bc72a..20a8e95f990ec 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -697,10 +697,7 @@ def test_to_csv_interval_index(self, using_infer_string): # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = df.copy() - if using_infer_string: - expected.index = expected.index.astype("string[pyarrow_numpy]") - else: - expected.index = expected.index.astype(str) + expected.index = expected.index.astype("str") tm.assert_frame_equal(result, expected) diff --git 
a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 9bd61736624ca..0766e927a64a9 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -209,7 +209,9 @@ def test_construction_with_mixed(self, float_string_frame, using_infer_string): expected = Series( [np.dtype("float64")] * 4 + [ - np.dtype("object") if not using_infer_string else "string", + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan), np.dtype("datetime64[us]"), np.dtype("timedelta64[us]"), ], diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index aab900f6eef47..c9eb2d5ca7be4 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -265,7 +265,7 @@ def test_emptylike_constructor(self, emptylike, expected_index, expected_columns tm.assert_frame_equal(result, expected) def test_constructor_mixed(self, float_string_frame, using_infer_string): - dtype = "string" if using_infer_string else np.object_ + dtype = "str" if using_infer_string else np.object_ assert float_string_frame["foo"].dtype == dtype def test_constructor_cast_failure(self): @@ -789,7 +789,7 @@ def test_constructor_dict_cast(self, using_infer_string): frame = DataFrame(test_data) assert len(frame) == 3 - assert frame["B"].dtype == np.object_ if not using_infer_string else "string" + assert frame["B"].dtype == np.object_ if not using_infer_string else "str" assert frame["A"].dtype == np.float64 def test_constructor_dict_cast2(self): @@ -1209,7 +1209,7 @@ def test_constructor_scalar_inference(self, using_infer_string): assert df["bool"].dtype == np.bool_ assert df["float"].dtype == np.float64 assert df["complex"].dtype == np.complex128 - assert df["object"].dtype == np.object_ if not using_infer_string else "string" + assert df["object"].dtype == np.object_ if not using_infer_string else "str" def test_constructor_arrays_and_scalars(self): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10), "b": True}) @@ -1292,7 +1292,7 @@ def test_constructor_list_of_lists(self, using_infer_string): # GH #484 df = DataFrame(data=[[1, "a"], [2, "b"]], columns=["num", "str"]) assert is_integer_dtype(df["num"]) - assert df["str"].dtype == np.object_ if not using_infer_string else "string" + assert df["str"].dtype == np.object_ if not using_infer_string else "str" # GH 4851 # list of 0-dim ndarrays @@ -1860,7 +1860,12 @@ def test_constructor_with_datetimes(self, using_infer_string): result = df.dtypes expected = Series( [np.dtype("int64")] - + [np.dtype(objectname) if not using_infer_string else "string"] * 2 + + [ + np.dtype(objectname) + if not using_infer_string + else pd.StringDtype(na_value=np.nan) + ] + * 2 + [np.dtype("M8[s]"), np.dtype("M8[us]")], index=list("ABCDE"), ) @@ -1882,7 +1887,11 @@ def test_constructor_with_datetimes(self, using_infer_string): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object") if not using_infer_string else "string"] + + [ + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan) + ] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -1904,7 +1913,11 @@ def test_constructor_with_datetimes(self, using_infer_string): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object") if not using_infer_string else "string"] + + [ + np.dtype("object") + if not using_infer_string + else 
pd.StringDtype(na_value=np.nan) + ] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -2124,7 +2137,9 @@ def test_constructor_for_list_with_dtypes(self, using_infer_string): [ np.dtype("int64"), np.dtype("float64"), - np.dtype("object") if not using_infer_string else "string", + np.dtype("object") + if not using_infer_string + else pd.StringDtype(na_value=np.nan), np.dtype("datetime64[ns]"), np.dtype("float64"), ], diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 75ef348b75deb..2c3e9c1d5e327 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -657,7 +657,11 @@ def test_unstack_dtypes(self, using_infer_string): df2["D"] = "foo" df3 = df2.unstack("B") result = df3.dtypes - dtype = "string" if using_infer_string else np.dtype("object") + dtype = ( + pd.StringDtype(na_value=np.nan) + if using_infer_string + else np.dtype("object") + ) expected = Series( [np.dtype("float64")] * 2 + [dtype] * 2, index=MultiIndex.from_arrays( diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py index d8401a8b2ae3f..9fe9bca8abdc9 100644 --- a/pandas/tests/generic/test_to_xarray.py +++ b/pandas/tests/generic/test_to_xarray.py @@ -52,7 +52,7 @@ def test_to_xarray_index_types(self, index_flat, df, using_infer_string): # column names are lost expected = df.copy() expected["f"] = expected["f"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.columns.name = None tm.assert_frame_equal(result.to_dataframe(), expected) @@ -81,7 +81,7 @@ def test_to_xarray_with_multiindex(self, df, using_infer_string): result = result.to_dataframe() expected = df.copy() expected["f"] = expected["f"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.columns.name = None tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 4972a6b3afa17..d91510d834e6c 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -79,7 +77,7 @@ def test_apply_index_date(using_infer_string): tm.assert_frame_equal(result, expected) -def test_apply_index_date_object(using_infer_string): +def test_apply_index_date_object(): # GH 5789 # don't auto coerce dates ts = [ @@ -111,10 +109,7 @@ def test_apply_index_date_object(using_infer_string): 1.40750, 1.40649, ] - dtype = "string[pyarrow_numpy]" if using_infer_string else object - exp_idx = Index( - ["2011-05-16", "2011-05-17", "2011-05-18"], dtype=dtype, name="date" - ) + exp_idx = Index(["2011-05-16", "2011-05-17", "2011-05-18"], name="date") expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(DeprecationWarning, match=msg): @@ -942,12 +937,11 @@ def test_func_returns_object(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], ) -def test_apply_datetime_issue(group_column_dtlike, using_infer_string): +def 
test_apply_datetime_issue(group_column_dtlike): # GH-28247 # groupby-apply throws an error if one of the columns in the DataFrame # is a datetime object and the column labels are different from @@ -958,8 +952,7 @@ def test_apply_datetime_issue(group_column_dtlike, using_infer_string): with tm.assert_produces_warning(DeprecationWarning, match=msg): result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) - dtype = "string" if using_infer_string else "object" - expected = DataFrame(["spam"], Index(["foo"], dtype=dtype, name="a"), columns=[42]) + expected = DataFrame(["spam"], Index(["foo"], dtype="str", name="a"), columns=[42]) tm.assert_frame_equal(result, expected) @@ -1040,7 +1033,7 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(DeprecationWarning, match=msg): result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes - dtype = "string" if using_infer_string else object + dtype = pd.StringDtype(na_value=np.nan) if using_infer_string else object expected = Series( [np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype], index=["observation", "color", "mood", "intensity", "score"], diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 1073dda954563..c70995de7b3b2 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -131,7 +131,7 @@ def f(x): result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") - dtype = "string[pyarrow_numpy]" if using_infer_string else object + dtype = "str" if using_infer_string else object expected["person_name"] = expected["person_name"].astype(dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4381b36b0b73a..dc1658e9acf3b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1216,7 +1216,7 @@ def test_groupby_complex_mean(): tm.assert_frame_equal(result, expected) -def test_groupby_complex_numbers(using_infer_string): +def test_groupby_complex_numbers(): # GH 17927 df = DataFrame( [ @@ -1225,11 +1225,10 @@ def test_groupby_complex_numbers(using_infer_string): {"a": 4, "b": 1}, ] ) - dtype = "string[pyarrow_numpy]" if using_infer_string else object expected = DataFrame( np.array([1, 1, 1], dtype=np.int64), index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"), - columns=Index(["a"], dtype=dtype), + columns=Index(["a"]), ) result = df.groupby("b", sort=False).count() tm.assert_frame_equal(result, expected) @@ -2097,7 +2096,7 @@ def get_categorical_invalid_expected(): idx = Index(lev, name=keys[0]) if using_infer_string: - columns = Index([], dtype="string[pyarrow_numpy]") + columns = Index([], dtype="str") else: columns = [] expected = DataFrame([], columns=columns, index=idx) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index ff4685b1e412d..029d322e4fdc3 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -181,6 +181,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "category type does not support sum operations", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), + re.escape(f"agg function failed [how->{method},dtype->str]"), ] ) with 
pytest.raises(exception, match=msg): @@ -198,6 +199,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): f"Cannot perform {method} with non-ordered Categorical", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), + re.escape(f"agg function failed [how->{method},dtype->str]"), ] ) with pytest.raises(exception, match=msg): diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index b2f345e5e6f77..955e3be107f75 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -9,7 +9,6 @@ class TestIndexRendering: - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_repr_is_valid_construction_code(self): # for the case of Index, where the repr is traditional rather than # stylized diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 8456e6a7acba5..b1180f2d7af14 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -851,7 +851,7 @@ def test_dtype_representation(using_infer_string): # GH#46900 pmidx = MultiIndex.from_arrays([[1], ["a"]], names=[("a", "b"), ("c", "d")]) result = pmidx.dtypes - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = Series( ["int64", exp], index=MultiIndex.from_tuples([("a", "b"), ("c", "d")]), diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index 6eeaeb6711d03..17ca876487330 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -41,7 +41,7 @@ def test_get_dtypes(using_infer_string): names=["int", "string", "dt"], ) - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( { "int": np.dtype("int64"), @@ -61,7 +61,7 @@ def test_get_dtypes_no_level_name(using_infer_string): pd.date_range("20200101", periods=2, tz="UTC"), ], ) - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( { "level_0": np.dtype("int64"), @@ -82,7 +82,7 @@ def test_get_dtypes_duplicate_level_names(using_infer_string): ], names=["A", "A", "A"], ).dtypes - exp = "object" if not using_infer_string else "string" + exp = "object" if not using_infer_string else pd.StringDtype(na_value=np.nan) expected = pd.Series( [np.dtype("int64"), exp, DatetimeTZDtype(tz="utc")], index=["A", "A", "A"], diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index ebf9dac715f8d..493a5be735d1a 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -170,6 +170,7 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): class TestSliceLocs: + # TODO(infer_string) parametrize over multiple string dtypes @pytest.mark.parametrize( "dtype", [ @@ -208,6 +209,7 @@ def test_slice_locs_negative_step(self, in_slice, expected, dtype): expected = Index(list(expected), dtype=dtype) tm.assert_index_equal(result, expected) + # TODO(infer_string) parametrize over multiple string dtypes @td.skip_if_no("pyarrow") def 
test_slice_locs_negative_step_oob(self): index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7eeb626d91dc8..4d02ec853e0da 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -79,7 +79,7 @@ def test_constructor_copy(self, using_infer_string): assert new_index.name == "name" if using_infer_string: tm.assert_extension_array_equal( - new_index.values, pd.array(arr, dtype="string[pyarrow_numpy]") + new_index.values, pd.array(arr, dtype="str") ) else: tm.assert_numpy_array_equal(arr, new_index.values) @@ -160,7 +160,7 @@ def test_constructor_from_frame_series_freq(self, using_infer_string): df = DataFrame(np.random.default_rng(2).random((5, 3))) df["date"] = dts result = DatetimeIndex(df["date"], freq="MS") - dtype = object if not using_infer_string else "string" + dtype = object if not using_infer_string else "str" assert df["date"].dtype == dtype expected.name = "date" tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 5508153322adb..fa5ec63dd32fe 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -588,7 +588,7 @@ def test_loc_nan_multiindex(using_infer_string): np.ones((1, 4)), index=Index( [np.nan], - dtype="object" if not using_infer_string else "string[pyarrow_numpy]", + dtype="object" if not using_infer_string else "str", name="u3", ), columns=Index(["d1", "d2", "d3", "d4"]), diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index d2c8454019a5e..908e95accfb0f 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -294,7 +294,7 @@ def test_dups_fancy_indexing_only_missing_label(self, using_infer_string): with pytest.raises( KeyError, match=re.escape( - "\"None of [Index(['E'], dtype='string')] are in the [index]\"" + "\"None of [Index(['E'], dtype='str')] are in the [index]\"" ), ): dfnu.loc[["E"]] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 41431c0e2813b..34d827a209dae 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -63,12 +63,17 @@ def test_not_change_nan_loc(series, new_series, expected_ser): class TestLoc: - def test_none_values_on_string_columns(self): + def test_none_values_on_string_columns(self, using_infer_string): # Issue #32218 - df = DataFrame(["1", "2", None], columns=["a"], dtype="str") - + df = DataFrame(["1", "2", None], columns=["a"], dtype=object) assert df.loc[2, "a"] is None + df = DataFrame(["1", "2", None], columns=["a"], dtype="str") + if using_infer_string: + assert np.isnan(df.loc[2, "a"]) + else: + assert df.loc[2, "a"] is None + @pytest.mark.parametrize("kind", ["series", "frame"]) def test_loc_getitem_int(self, kind, request): # int label @@ -1460,7 +1465,7 @@ def test_loc_setitem_single_row_categorical(self, using_infer_string): result = df["Alpha"] expected = Series(categories, index=df.index, name="Alpha").astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) tm.assert_series_equal(result, expected) @@ -1635,7 +1640,7 @@ def test_loc_setitem_single_column_mixed(self, using_infer_string): df.loc[df.index[::2], "str"] = np.nan expected = Series( [np.nan, "qux", np.nan, "qux", np.nan], - dtype=object if not using_infer_string else 
"string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ).values tm.assert_almost_equal(df["str"].values, expected) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index ca551024b4c1f..5fcb71d0186a6 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -227,7 +227,7 @@ def test_partial_set_empty_frame_empty_consistencies(self, using_infer_string): { "x": Series( ["1", "2"], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ), "y": Series([np.nan, np.nan], dtype=object), } diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index ce88bae6e02f2..30c5d3177c5a5 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -626,7 +626,7 @@ def _compare(old_mgr, new_mgr): mgr.iset(1, np.array(["2."] * N, dtype=np.object_)) mgr.iset(2, np.array(["foo."] * N, dtype=np.object_)) new_mgr = mgr.convert(copy=True) - dtype = "string[pyarrow_numpy]" if using_infer_string else np.object_ + dtype = "str" if using_infer_string else np.object_ assert new_mgr.iget(0).dtype == dtype assert new_mgr.iget(1).dtype == dtype assert new_mgr.iget(2).dtype == dtype diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 7ecddb18a61ec..57091b268a9db 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -766,7 +766,7 @@ def test_to_excel_interval_no_labels(self, path, using_infer_string): df["new"] = pd.cut(df[0], 10) expected["new"] = pd.cut(expected[0], 10).astype( - str if not using_infer_string else "string[pyarrow_numpy]" + str if not using_infer_string else "str" ) df.to_excel(path, sheet_name="test1") diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 53e819ac5eaff..1e47b3bc38737 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -75,7 +75,7 @@ def test_build_table_schema(self, df_schema, using_infer_string): "primaryKey": ["idx"], } if using_infer_string: - expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "string"} + expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "str"} assert result == expected result = build_table_schema(df_schema) assert "pandas_version" in result @@ -128,7 +128,7 @@ def test_multiindex(self, df_schema, using_infer_string): "type": "any", "extDtype": "string", } - expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "string"} + expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "str"} assert result == expected df.index.names = ["idx0", None] @@ -311,7 +311,7 @@ def test_to_json(self, df_table, using_infer_string): ] if using_infer_string: - fields[2] = {"name": "B", "type": "any", "extDtype": "string"} + fields[2] = {"name": "B", "type": "any", "extDtype": "str"} schema = {"fields": fields, "primaryKey": ["idx"]} data = [ diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a1d2e93e7c523..cb94111aedffd 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -266,7 +266,7 @@ def test_roundtrip_categorical( expected = categorical_frame.copy() expected.index = expected.index.astype( - str if not using_infer_string else "string[pyarrow_numpy]" + str if not using_infer_string else "str" 
) # Categorical not preserved expected.index.name = None # index names aren't preserved in JSON assert_json_roundtrip_equal(result, expected, orient) @@ -621,7 +621,7 @@ def test_blocks_compat_GH9037(self, using_infer_string): # JSON deserialisation always creates unicode strings df_mixed.columns = df_mixed.columns.astype( - np.str_ if not using_infer_string else "string[pyarrow_numpy]" + np.str_ if not using_infer_string else "str" ) data = StringIO(df_mixed.to_json(orient="split")) df_roundtrip = read_json(data, orient="split") @@ -706,7 +706,7 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string expected = string_series if using_infer_string and orient in ("split", "index", "columns"): # These schemas don't contain dtypes, so we infer string - expected.index = expected.index.astype("string[pyarrow_numpy]") + expected.index = expected.index.astype("str") if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -1492,7 +1492,6 @@ def test_from_json_to_json_table_index_and_columns(self, index, columns): result = read_json(StringIO(dfjson), orient="table") tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_from_json_to_json_table_dtypes(self): # GH21345 expected = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py index 55bd3f0d5a03a..7d0802dcf2e47 100644 --- a/pandas/tests/io/pytables/test_keys.py +++ b/pandas/tests/io/pytables/test_keys.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, HDFStore, @@ -13,7 +15,10 @@ tables, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_keys(setup_path): diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py index 03622faa2b5a8..bbe1cd77e0d9f 100644 --- a/pandas/tests/io/pytables/test_subclass.py +++ b/pandas/tests/io/pytables/test_subclass.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Series, @@ -17,6 +19,7 @@ class TestHDFStoreSubclass: # GH 33748 + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_supported_for_subclass_dataframe(self, tmp_path): data = {"a": [1, 2], "b": [3, 4]} sdf = tm.SubclassedDataFrame(data, dtype=np.intp) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 56707560c2fda..75ecd1d929d58 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -377,6 +377,7 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): expected = f_path.read() assert result == expected + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_write_fspath_hdf5(self): # Same test as write_fspath_all, except HDF5 files aren't # necessarily byte-for-byte identical for a given dataframe, so we'll diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 65f4156cedf49..19b60e17d3a92 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -168,6 +168,7 @@ def test_excel_options(fsspectest): assert fsspectest.test[0] == "read" +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def 
test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -277,7 +278,6 @@ def test_not_present_exception(): read_csv("memory://test/test.csv") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_feather_options(fsspectest): pytest.importorskip("pyarrow") df = DataFrame({"a": [0]}) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index a7ae9c7049702..96bc0326b23ab 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -197,6 +197,7 @@ def test_to_csv_compression_encoding_gcs( tm.assert_frame_equal(df, read_df) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index b2d96cb1d9133..a85576ff13f5c 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -4,8 +4,6 @@ import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserWarning import pandas.util._test_decorators as td @@ -85,7 +83,6 @@ def read_xml_iterparse(data, **kwargs): # DTYPE -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dtype_single_str(parser): df_result = read_xml(StringIO(xml_types), dtype={"degrees": "str"}, parser=parser) df_iter = read_xml_iterparse( @@ -211,7 +208,6 @@ def test_wrong_dtype(xml_books, parser, iterparse): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_both_dtype_converters(parser): df_expected = DataFrame( { diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index bbaaf0abecfbd..8e6a14e6bfb8f 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -59,9 +59,7 @@ def test_categorical_concat_dtypes(self, using_infer_string): num = Series([1, 2, 3]) df = pd.concat([Series(cat), obj, num], axis=1, keys=index) - result = df.dtypes == ( - object if not using_infer_string else "string[pyarrow_numpy]" - ) + result = df.dtypes == (object if not using_infer_string else "str") expected = Series([False, True, False], index=index) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py index 30ef0a934157b..9560087615123 100644 --- a/pandas/tests/reshape/concat/test_empty.py +++ b/pandas/tests/reshape/concat/test_empty.py @@ -27,7 +27,7 @@ def test_handle_empty_objects(self, sort, using_infer_string): expected = df.reindex(columns=["a", "b", "c", "d", "foo"]) expected["foo"] = expected["foo"].astype( - object if not using_infer_string else "string[pyarrow_numpy]" + object if not using_infer_string else "str" ) expected.loc[0:4, "foo"] = "bar" @@ -284,7 +284,7 @@ def test_concat_empty_dataframe_different_dtypes(self, using_infer_string): result = concat([df1[:0], df2[:0]]) assert result["a"].dtype == np.int64 - assert result["b"].dtype == np.object_ if not using_infer_string else "string" + assert result["b"].dtype == np.object_ if not using_infer_string else "str" def test_concat_to_empty_ea(self): """48510 `concat` to an empty EA should maintain type EA dtype.""" diff --git a/pandas/tests/reshape/concat/test_index.py 
b/pandas/tests/reshape/concat/test_index.py index 52bb9fa0f151b..49c94168d203e 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -452,9 +452,7 @@ def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string): s1 = Series(["a", "b", "c"]) s2 = Series(["a", "b"]) s3 = Series(["a", "b", "c", "d"]) - s4 = Series( - [], dtype=object if not using_infer_string else "string[pyarrow_numpy]" - ) + s4 = Series([], dtype=object if not using_infer_string else "str") result = concat( [s1, s2, s3, s4], sort=False, join="outer", ignore_index=False, axis=1 ) @@ -465,7 +463,7 @@ def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string): ["c", np.nan] * 2, [np.nan] * 2 + ["d"] + [np.nan], ], - dtype=object if not using_infer_string else "string[pyarrow_numpy]", + dtype=object if not using_infer_string else "str", ) tm.assert_frame_equal( result, expected, check_index_type=True, check_column_type=True diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index db5a0437a14f0..91f0cf6c31085 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -156,7 +156,7 @@ def test_join_on(self, target_source, infer_string): # overlap source_copy = source.copy() msg = ( - "You are trying to merge on float64 and object|string columns for key " + "You are trying to merge on float64 and object|str columns for key " "'A'. If you wish to proceed you should use pd.concat" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index ed49f3b758cc5..8a9fe9f3e2cfd 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -826,7 +826,7 @@ def test_overlapping_columns_error_message(self): # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo'\], dtype='object|string'\)" + msg = r"Data columns not unique: Index\(\['foo'\], dtype='object|str'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) @@ -1877,7 +1877,7 @@ def test_identical(self, left, using_infer_string): # merging on the same, should preserve dtypes merged = merge(left, left, on="X") result = merged.dtypes.sort_index() - dtype = np.dtype("O") if not using_infer_string else "string" + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( [CategoricalDtype(categories=["foo", "bar"]), dtype, dtype], index=["X", "Y_x", "Y_y"], @@ -1889,7 +1889,7 @@ def test_basic(self, left, right, using_infer_string): # so should preserve the merged column merged = merge(left, right, on="X") result = merged.dtypes.sort_index() - dtype = np.dtype("O") if not using_infer_string else "string" + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), @@ -2003,7 +2003,7 @@ def test_other_columns(self, left, right, using_infer_string): merged = merge(left, right, on="X") result = merged.dtypes.sort_index() - dtype = np.dtype("O") if not using_infer_string else "string" + dtype = np.dtype("O") if not using_infer_string else "str" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), @@ -2040,7 +2040,7 @@ def test_dtype_on_merged_different( merged = merge(left, right, on="X", how=join_type) result = merged.dtypes.sort_index() - dtype = np.dtype("O") if not using_infer_string else "string" + dtype = np.dtype("O") if not using_infer_string else "str" 
expected = Series([dtype, dtype, np.dtype("int64")], index=["X", "Y", "Z"]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 0865e3cfa8149..11e29f4e10dc4 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -3183,7 +3183,7 @@ def test_by_nullable(self, any_numeric_ea_dtype, using_infer_string): ) expected["value_y"] = np.array([np.nan, np.nan, np.nan], dtype=object) if using_infer_string: - expected["value_y"] = expected["value_y"].astype("string[pyarrow_numpy]") + expected["value_y"] = expected["value_y"].astype("str") tm.assert_frame_equal(result, expected) def test_merge_by_col_tz_aware(self): @@ -3234,7 +3234,7 @@ def test_by_mixed_tz_aware(self, using_infer_string): ) expected["value_y"] = np.array([np.nan], dtype=object) if using_infer_string: - expected["value_y"] = expected["value_y"].astype("string[pyarrow_numpy]") + expected["value_y"] = expected["value_y"].astype("str") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", ["float64", "int16", "m8[ns]", "M8[us]"]) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index b12438b6327ad..6009b263a83c5 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -336,7 +336,7 @@ def test_no_prefix_string_cats_default_category( dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) result = from_dummies(dummies, default_category=default_category) if using_infer_string: - expected[""] = expected[""].astype("string[pyarrow_numpy]") + expected[""] = expected[""].astype("str") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 3d9b3a6d1c7a2..2c17b7f6a5a47 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -122,7 +122,7 @@ def test_get_dummies_basic_types(self, sparse, dtype, using_infer_string): result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype) - key = "string" if using_infer_string else "object" + key = "str" if using_infer_string else "object" expected_counts = {"int64": 1, key: 1} expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index f0803ac2f2a30..0dd2c227d6aa7 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -599,7 +599,7 @@ def test_strftime_period_days(self, using_infer_string): dtype="=U10", ) if using_infer_string: - expected = expected.astype("string[pyarrow_numpy]") + expected = expected.astype("str") tm.assert_index_equal(result, expected) @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") diff --git a/pandas/tests/series/indexing/test_delitem.py b/pandas/tests/series/indexing/test_delitem.py index 3d1082c3d040b..7440ef2692c47 100644 --- a/pandas/tests/series/indexing/test_delitem.py +++ b/pandas/tests/series/indexing/test_delitem.py @@ -31,16 +31,15 @@ def test_delitem(self): del s[0] tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) - def test_delitem_object_index(self, using_infer_string): + def test_delitem_object_index(self): # Index(dtype=object) - dtype = "string[pyarrow_numpy]" if using_infer_string else object - s = Series(1, 
index=Index(["a"], dtype=dtype)) + s = Series(1, index=Index(["a"], dtype="str")) del s["a"] - tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="str"))) s["a"] = 1 - tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype=dtype))) + tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype="str"))) del s["a"] - tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="str"))) def test_delitem_missing_key(self): # empty diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 596a225c288b8..9783dcd2fea07 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -363,9 +363,7 @@ def test_getitem_no_matches(self, box): key = Series(["C"], dtype=object) key = box(key) - msg = ( - r"None of \[Index\(\['C'\], dtype='object|string'\)\] are in the \[index\]" - ) + msg = r"None of \[Index\(\['C'\], dtype='object|str'\)\] are in the \[index\]" with pytest.raises(KeyError, match=msg): ser[key] diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index fb8e5c31929b2..e2c27fe5575db 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -624,7 +624,7 @@ def test_setitem_enlargement_object_none(self, nulls_fixture, using_infer_string ser = Series(["a", "b"]) ser[3] = nulls_fixture dtype = ( - "string[pyarrow_numpy]" + "str" if using_infer_string and not isinstance(nulls_fixture, Decimal) else object ) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 4c8028e74ee55..ef0757ffe4aa8 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -538,12 +538,12 @@ def test_astype_categorical_to_other(self): expected = ser tm.assert_series_equal(ser.astype("category"), expected) tm.assert_series_equal(ser.astype(CategoricalDtype()), expected) - msg = r"Cannot cast object|string dtype to float64" + msg = r"Cannot cast object|str dtype to float64" with pytest.raises(ValueError, match=msg): ser.astype("float64") cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) - exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object) + exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype="str") tm.assert_series_equal(cat.astype("str"), exp) s2 = Series(Categorical(["1", "2", "3", "4"])) exp2 = Series([1, 2, 3, 4]).astype("int") diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 251d4063008b9..ac489b2579e05 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -101,16 +101,16 @@ def test_map_series_stringdtype(any_string_dtype, using_infer_string): expected = Series(data=["rabbit", "dog", "cat", item], dtype=any_string_dtype) if using_infer_string and any_string_dtype == "object": - expected = expected.astype("string[pyarrow_numpy]") + expected = expected.astype("str") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "data, expected_dtype", - [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], object)], + [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], "str")], ) -def test_map_categorical_with_nan_values(data, expected_dtype, 
using_infer_string): +def test_map_categorical_with_nan_values(data, expected_dtype): # GH 20714 bug fixed in: GH 24275 def func(val): return val.split("-")[0] @@ -118,8 +118,6 @@ def func(val): s = Series(data, dtype="category") result = s.map(func, na_action="ignore") - if using_infer_string and expected_dtype == object: - expected_dtype = "string[pyarrow_numpy]" expected = Series(["1", "1", np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -145,9 +143,7 @@ def test_map_simple_str_callables_same_as_astype( # test that we are evaluating row-by-row first # before vectorized evaluation result = string_series.map(func) - expected = string_series.astype( - str if not using_infer_string else "string[pyarrow_numpy]" - ) + expected = string_series.astype(str if not using_infer_string else "str") tm.assert_series_equal(result, expected) @@ -497,7 +493,7 @@ def test_map_categorical(na_action, using_infer_string): result = s.map(lambda x: "A", na_action=na_action) exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object if not using_infer_string else "string" + assert result.dtype == object if not using_infer_string else "str" @pytest.mark.parametrize( diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py index 119654bd19b3f..a8f3862d39f07 100644 --- a/pandas/tests/series/methods/test_rename.py +++ b/pandas/tests/series/methods/test_rename.py @@ -64,7 +64,7 @@ def test_rename_set_name_inplace(self, using_infer_string): assert ser.name == name exp = np.array(["a", "b", "c"], dtype=np.object_) if using_infer_string: - exp = array(exp, dtype="string[pyarrow_numpy]") + exp = array(exp, dtype="str") tm.assert_extension_array_equal(ser.index.values, exp) else: tm.assert_numpy_array_equal(ser.index.values, exp) diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py index 48e2608a1032a..fa571fa126b38 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ b/pandas/tests/series/methods/test_reset_index.py @@ -193,7 +193,7 @@ def test_reset_index_dtypes_on_empty_series_with_multiindex( # GH 19602 - Preserve dtype on empty Series with MultiIndex idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = Series(dtype=object, index=idx)[:0].reset_index().dtypes - exp = "string" if using_infer_string else object + exp = "str" if using_infer_string else object expected = Series( { "level_0": np.int64, diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index 999dd90d337d9..efb249fdedf3d 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -177,9 +177,6 @@ def test_to_csv_interval_index(self, using_infer_string): result = self.read_csv(path, index_col=0) # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) - expected = s.copy() - if using_infer_string: - expected.index = expected.index.astype("string[pyarrow_numpy]") - else: - expected.index = expected.index.astype(str) + expected = s + expected.index = expected.index.astype("str") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 0aaa8ddcfda0c..6efe0bcb8b45d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -167,7 +167,7 @@ def test_constructor(self, datetime_series, 
using_infer_string): # Mixed type Series mixed = Series(["hello", np.nan], index=[0, 1]) - assert mixed.dtype == np.object_ if not using_infer_string else "string" + assert mixed.dtype == np.object_ if not using_infer_string else "str" assert np.isnan(mixed[1]) assert not empty_series.index._is_all_dates @@ -1469,7 +1469,7 @@ def test_fromDict(self, using_infer_string): data = {"a": 0, "b": "1", "c": "2", "d": "3"} series = Series(data) - assert series.dtype == np.object_ if not using_infer_string else "string" + assert series.dtype == np.object_ if not using_infer_string else "str" data = {"a": "0", "b": "1"} series = Series(data, dtype=float) @@ -1481,7 +1481,7 @@ def test_fromValue(self, datetime_series, using_infer_string): assert len(nans) == len(datetime_series) strings = Series("foo", index=datetime_series.index) - assert strings.dtype == np.object_ if not using_infer_string else "string" + assert strings.dtype == np.object_ if not using_infer_string else "str" assert len(strings) == len(datetime_series) d = datetime.now() @@ -2141,6 +2141,11 @@ def test_series_string_inference_storage_definition(self): result = Series(["a", "b"], dtype="string") tm.assert_series_equal(result, expected) + expected = Series(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)) + with pd.option_context("future.infer_string", True): + result = Series(["a", "b"], dtype="str") + tm.assert_series_equal(result, expected) + def test_series_constructor_infer_string_scalar(self): # GH#55537 with pd.option_context("future.infer_string", True): diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index 4939f3221d268..77e77a9337d63 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -323,7 +323,7 @@ def test_categorical_repr(self, using_infer_string): "0 a\n1 b\n" " ..\n" "48 a\n49 b\n" - "Length: 50, dtype: category\nCategories (2, string): [a, b]" + "Length: 50, dtype: category\nCategories (2, str): [a, b]" ) else: exp = ( @@ -341,7 +341,7 @@ def test_categorical_repr(self, using_infer_string): exp = ( "0 a\n1 b\n" "dtype: category\n" - "Categories (26, string): [a < b < c < d ... w < x < y < z]" + "Categories (26, str): [a < b < c < d ... 
w < x < y < z]" ) else: exp = ( diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index bf88da04b73ff..51ce73ef54300 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -167,7 +165,6 @@ def test_pandas_datareader(): pytest.importorskip("pandas_datareader") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_pyarrow(df): pyarrow = pytest.importorskip("pyarrow") diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 79132591b15b3..dd5218ab9404f 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -111,7 +111,7 @@ def test_empty_dtypes(check_dtype): @pytest.mark.parametrize("check_like", [True, False]) def test_frame_equal_index_mismatch(check_like, obj_fixture, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""{obj_fixture}\\.index are different @@ -131,7 +131,7 @@ def test_frame_equal_index_mismatch(check_like, obj_fixture, using_infer_string) @pytest.mark.parametrize("check_like", [True, False]) def test_frame_equal_columns_mismatch(check_like, obj_fixture, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""{obj_fixture}\\.columns are different diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index dc6efdcec380e..ab52d6c8e9f39 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -207,7 +207,7 @@ def test_index_equal_names(name1, name2): def test_index_equal_category_mismatch(check_categorical, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""Index are different diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 1878e7d838064..0d56885a1cb84 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -221,9 +221,9 @@ def test_series_equal_categorical_values_mismatch(rtol, using_infer_string): Series values are different \\(66\\.66667 %\\) \\[index\\]: \\[0, 1, 2\\] \\[left\\]: \\['a', 'b', 'c'\\] -Categories \\(3, string\\): \\[a, b, c\\] +Categories \\(3, str\\): \\[a, b, c\\] \\[right\\]: \\['a', 'c', 'b'\\] -Categories \\(3, string\\): \\[a, b, c\\]""" +Categories \\(3, str\\): \\[a, b, c\\]""" else: msg = """Series are different @@ -258,7 +258,7 @@ def test_series_equal_datetime_values_mismatch(rtol): def test_series_equal_categorical_mismatch(check_categorical, using_infer_string): if using_infer_string: - dtype = "string" + dtype = "str" else: dtype = "object" msg = f"""Attributes of Series are different diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index fe2da210c6fe9..948565be36b5b 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -71,7 +71,7 @@ def test_sum_object_str_raises(step): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3, step=step) with pytest.raises( - DataError, match="Cannot aggregate 
non-numeric type: object|string"
+        DataError, match="Cannot aggregate non-numeric type: object|str"
     ):
         # GH#42738, enforced in 2.0
         r.sum()

From 79f87d3033925cc0cbf1b17de993c3e55e26c0f3 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 8 Aug 2024 17:25:05 +0200
Subject: [PATCH 211/396] String dtype: fix alignment sorting in case of python storage (#59448)

* String dtype: fix alignment sorting in case of python storage

* add test
---
 pandas/core/indexes/base.py               |  5 ++++-
 pandas/tests/series/methods/test_align.py | 13 +++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 11d2436b0e095..825316585c03c 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -5072,7 +5072,10 @@ def _can_use_libjoin(self) -> bool:
             return (
                 isinstance(self.dtype, np.dtype)
                 or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray))
-                or self.dtype == "string[python]"
+                or (
+                    isinstance(self.dtype, StringDtype)
+                    and self.dtype.storage == "python"
+                )
             )
         # Exclude index types where the conversion to numpy converts to object dtype,
         # which negates the performance benefit of libjoin
diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py
index cb60cd2e5bcf3..f332aad0c05f9 100644
--- a/pandas/tests/series/methods/test_align.py
+++ b/pandas/tests/series/methods/test_align.py
@@ -211,6 +211,19 @@ def test_align_periodindex(join_type):
         ts.align(ts[::2], join=join_type)
 
 
+def test_align_stringindex(any_string_dtype):
+    left = Series(range(3), index=pd.Index(["a", "b", "d"], dtype=any_string_dtype))
+    right = Series(range(3), index=pd.Index(["a", "b", "c"], dtype=any_string_dtype))
+    result_left, result_right = left.align(right)
+
+    expected_idx = pd.Index(["a", "b", "c", "d"], dtype=any_string_dtype)
+    expected_left = Series([0, 1, np.nan, 2], index=expected_idx)
+    expected_right = Series([0, 1, 2, np.nan], index=expected_idx)
+
+    tm.assert_series_equal(result_left, expected_left)
+    tm.assert_series_equal(result_right, expected_right)
+
+
 def test_align_left_fewer_levels():
     # GH#45224
     left = Series([2], index=pd.MultiIndex.from_tuples([(1, 3)], names=["a", "c"]))

From daa14c79aa9c2d34cd925c00ff99ec09097592ad Mon Sep 17 00:00:00 2001
From: Will Ayd
Date: Wed, 14 Aug 2024 12:17:31 -0400
Subject: [PATCH 212/396] TST (string dtype): add test build with future strings enabled without pyarrow (#59437)

* TST (string dtype): add test build with future strings enabled without pyarrow

* ensure the build doesn't override the default ones

* uninstall -> remove

* avoid jobs with same env being cancelled

* use different python version for both future jobs

* add some xfails

* fixup xfails

* less strict
---
 .github/actions/setup-conda/action.yml               |  6 +++++
 .github/workflows/unit-tests.yml                     |  7 +++++-
 pandas/tests/apply/test_frame_apply.py               |  5 ++++
 pandas/tests/apply/test_invalid_arg.py               | 11 +++++++++
 pandas/tests/apply/test_numba.py                     |  1 +
 pandas/tests/arithmetic/test_object.py               |  6 +++++
 pandas/tests/arrays/boolean/test_arithmetic.py       |  7 ++++++
 pandas/tests/arrays/categorical/test_analytics.py    |  9 +++++--
 pandas/tests/arrays/categorical/test_constructors.py |  6 ++++-
 pandas/tests/arrays/integer/test_reduction.py        |  9 ++++---
 pandas/tests/base/test_conversion.py                 | 12 +++++++++-
 pandas/tests/copy_view/test_astype.py                |  5 ++--
 pandas/tests/copy_view/test_functions.py             | 12 ++++++----
 pandas/tests/copy_view/test_interp_fillna.py         |  4 +++-
 pandas/tests/copy_view/test_methods.py               |  8 +++++--
 pandas/tests/copy_view/test_replace.py               |  6 +++--
 pandas/tests/extension/base/ops.py                   | 24 +++++++++++++++++++
 pandas/tests/extension/test_categorical.py           |  2 ++
 pandas/tests/extension/test_numpy.py                 |  7 ++++++
 pandas/tests/frame/indexing/test_indexing.py         |  5 +++-
 pandas/tests/frame/indexing/test_where.py            |  8 +++++++
 pandas/tests/frame/methods/test_info.py              |  3 ++-
 pandas/tests/frame/methods/test_rank.py              | 14 ++++++++++-
 .../tests/frame/methods/test_value_counts.py         |  7 ++++++
 pandas/tests/frame/test_api.py                       |  6 ++++-
 pandas/tests/frame/test_arithmetic.py                |  5 +++-
 pandas/tests/frame/test_constructors.py              |  3 ++-
 pandas/tests/frame/test_logical_ops.py               |  7 ++++++
 pandas/tests/frame/test_reductions.py                | 11 ++++++---
 pandas/tests/frame/test_unary.py                     | 10 +++++++-
 .../groupby/methods/test_value_counts.py             |  4 ++++
 pandas/tests/groupby/test_groupby.py                 |  5 ++++
 pandas/tests/groupby/test_groupby_dropna.py          | 19 +++++++++++++++
 pandas/tests/indexes/object/test_indexing.py         |  6 +++++
 pandas/tests/indexes/test_base.py                    | 20 +++++++++++++++-
 pandas/tests/indexes/test_old_base.py                |  6 +++++
 pandas/tests/indexing/test_loc.py                    |  4 ++++
 pandas/tests/io/formats/style/test_bar.py            |  1 +
 pandas/tests/io/parser/conftest.py                   | 22 +++++++++++++++--
 pandas/tests/reductions/test_reductions.py           |  5 ++++
 pandas/tests/series/indexing/test_setitem.py         |  6 +++++
 pandas/tests/series/test_api.py                      |  7 ++++++
 pandas/tests/series/test_logical_ops.py              |  5 ++++
 pandas/tests/series/test_reductions.py               | 13 ++++++++++
 pandas/tests/window/test_rolling.py                  |  6 +++++
 45 files changed, 322 insertions(+), 33 deletions(-)

diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml
index e31fed120267a..4fe901998cbcc 100644
--- a/.github/actions/setup-conda/action.yml
+++ b/.github/actions/setup-conda/action.yml
@@ -16,3 +16,9 @@ runs:
         condarc-file: ci/.condarc
         cache-environment: true
         cache-downloads: true
+
+    - name: Uninstall pyarrow
+      if: ${{ env.REMOVE_PYARROW == '1' }}
+      run: |
+        micromamba remove -y pyarrow
+      shell: bash -el {0}
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 3ba61e39316af..eef899173403b 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -29,6 +29,7 @@ jobs:
         env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml]
         # Prevent the include jobs from overriding other jobs
         pattern: [""]
+        pandas_future_infer_string: ["0"]
         include:
           - name: "Downstream Compat"
             env_file: actions-311-downstream_compat.yaml
@@ -86,6 +87,9 @@
             pattern: "not slow and not network and not single_cpu"
             pandas_copy_on_write: "warn"
          - name: "Future infer strings"
+            env_file: actions-312.yaml
+            pandas_future_infer_string: "1"
+          - name: "Future infer strings (without pyarrow)"
             env_file: actions-311.yaml
             pandas_future_infer_string: "1"
           - name: "Pypy"
@@ -114,9 +118,10 @@
       NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }}
       # Clipboard tests
       QT_QPA_PLATFORM: offscreen
+      REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }}
     concurrency:
       # https://github.community/t/concurrecy-not-work-for-push/183068/7
-      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }}
+      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }}-${{ matrix.pandas_future_infer_string }}
       cancel-in-progress: true
 
     services:
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index a774ae214e09a..5e0f991d5c406 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -6,6 +6,8 @@
 
 from pandas._config import using_string_dtype
 
+from pandas.compat import HAS_PYARROW
+
 from pandas.core.dtypes.dtypes import CategoricalDtype
 
 import pandas as pd
@@ -1201,6 +1203,9 @@ def test_agg_multiple_mixed():
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(
+    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
 def test_agg_multiple_mixed_raises():
     # GH 20909
     mdf = DataFrame(
diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py
index b5ad1094f5bf5..1c5b170c8753f 100644
--- a/pandas/tests/apply/test_invalid_arg.py
+++ b/pandas/tests/apply/test_invalid_arg.py
@@ -12,6 +12,9 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
 from pandas.errors import SpecificationError
 
 from pandas import (
@@ -209,6 +212,10 @@ def transform(row):
     data.apply(transform, axis=1)
 
 
+# we should raise a proper TypeError instead of propagating the pyarrow error
+@pytest.mark.xfail(
+    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
 @pytest.mark.parametrize(
     "df, func, expected",
     tm.get_cython_table_params(
@@ -229,6 +236,10 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_str
         df.agg(func, axis=axis)
 
 
+# we should raise a proper TypeError instead of propagating the pyarrow error
+@pytest.mark.xfail(
+    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
 @pytest.mark.parametrize(
     "series, func, expected",
     chain(
diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py
index 6ac0b49f0e4e7..6bbe5100e8826 100644
--- a/pandas/tests/apply/test_numba.py
+++ b/pandas/tests/apply/test_numba.py
@@ -104,6 +104,7 @@ def test_numba_nonunique_unsupported(apply_axis):
 
 
 def test_numba_unsupported_dtypes(apply_axis):
+    pytest.importorskip("pyarrow")
     f = lambda x: x
     df = DataFrame({"a": [1, 2], "b": ["a", "b"], "c": [4, 5]})
     df["c"] = df["c"].astype("double[pyarrow]")
diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py
index 4b5156d0007bb..899ea1910d055 100644
--- a/pandas/tests/arithmetic/test_object.py
+++ b/pandas/tests/arithmetic/test_object.py
@@ -8,6 +8,9 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -315,6 +318,9 @@ def test_add(self):
         expected = pd.Index(["1a", "1b", "1c"])
         tm.assert_index_equal("1" + index, expected)
 
+    @pytest.mark.xfail(
+        using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+    )
     def test_sub_fail(self, using_infer_string):
         index = pd.Index([str(i) for i in range(10)])
 
diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py
index 0c4fcf149eb20..4dbd8eb9f5ca7 100644
--- a/pandas/tests/arrays/boolean/test_arithmetic.py
+++ b/pandas/tests/arrays/boolean/test_arithmetic.py
@@ -3,6 +3,10 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
+
 import pandas as pd
 import pandas._testing as tm
 
@@ -90,6 +94,9 @@ def test_op_int8(left_array, right_array, opname):
 # -----------------------------------------------------------------------------
 
 
+@pytest.mark.xfail(
+    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
+)
 def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
     # invalid ops
 
diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py
index 7c7a236ab83cd..a38814ca43773 100644
--- a/pandas/tests/arrays/categorical/test_analytics.py
+++ b/pandas/tests/arrays/categorical/test_analytics.py
@@ -6,7 +6,10 @@
 
 from pandas._config import using_string_dtype
 
-from pandas.compat import PYPY
+from pandas.compat import (
+    HAS_PYARROW,
+    PYPY,
+)
 
 from pandas import (
     Categorical,
@@ -298,7 +301,9 @@ def test_nbytes(self):
         exp = 3 + 3 * 8  # 3 int8s for values + 3 int64s for categories
         assert cat.nbytes == exp
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+    @pytest.mark.xfail(
+        using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)"
+    )
     def test_memory_usage(self):
         cat = Categorical([1, 2, 3])
 
diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
index 6813683cb5219..8ac479cf8a0a4 100644
--- a/pandas/tests/arrays/categorical/test_constructors.py
+++ b/pandas/tests/arrays/categorical/test_constructors.py
@@ -8,6 +8,8 @@
 
 from pandas._config import using_string_dtype
 
+from pandas.compat import HAS_PYARROW
+
 from pandas.core.dtypes.common import (
     is_float_dtype,
     is_integer_dtype,
@@ -449,7 +451,9 @@ def test_constructor_str_unknown(self):
         with pytest.raises(ValueError, match="Unknown dtype"):
             Categorical([1, 2], dtype="foo")
 
-    @pytest.mark.xfail(using_string_dtype(), reason="Can't be NumPy strings")
+    @pytest.mark.xfail(
+        using_string_dtype() and HAS_PYARROW, reason="Can't be NumPy strings"
+    )
     def test_constructor_np_strs(self):
        # GH#31499 Hashtable.map_locations needs to work on np.str_ objects
         cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py
index db04862e4ea07..e485c7f79b475 100644
--- a/pandas/tests/arrays/integer/test_reduction.py
+++ b/pandas/tests/arrays/integer/test_reduction.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas.compat import HAS_PYARROW
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -102,9 +104,10 @@ def test_groupby_reductions(op, expected):
         ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")],
     ],
 )
-def test_mixed_reductions(op, expected, using_infer_string):
-    if op in ["any", "all"] and using_infer_string:
-        expected = expected.astype("bool")
+def test_mixed_reductions(request, op, expected, using_infer_string):
+    if op in ["any", "all"] and using_infer_string and HAS_PYARROW:
+        # TODO(infer_string) inconsistent result type
+        request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
     df = DataFrame(
         {
             "A": ["a", "b", "b"],
diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
index fe0f1f1454a55..d62599c56e467 100644
--- a/pandas/tests/base/test_conversion.py
+++ b/pandas/tests/base/test_conversion.py
@@ -1,6 +1,10 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
+
 from pandas.core.dtypes.dtypes import DatetimeTZDtype
 
 import pandas as pd
@@ -20,6 +24,7 @@
SparseArray, TimedeltaArray, ) +from pandas.core.arrays.string_ import StringArrayNumpySemantics from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics @@ -218,7 +223,9 @@ def test_iter_box_period(self): ) def test_values_consistent(arr, expected_type, dtype, using_infer_string): if using_infer_string and dtype == "object": - expected_type = ArrowStringArrayNumpySemantics + expected_type = ( + ArrowStringArrayNumpySemantics if HAS_PYARROW else StringArrayNumpySemantics + ) l_values = Series(arr)._values r_values = pd.Index(arr)._values assert type(l_values) is expected_type @@ -355,6 +362,9 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request): tm.assert_numpy_array_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize("as_series", [True, False]) @pytest.mark.parametrize( "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)] diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 514ee6410ecf1..fb82329d5b50d 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -5,6 +5,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under12p0 import pandas.util._test_decorators as td @@ -223,7 +224,7 @@ def test_astype_arrow_timestamp(using_copy_on_write): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_convert_dtypes_infer_objects(using_copy_on_write): ser = Series(["a", "b", "c"]) ser_orig = ser.copy() @@ -243,7 +244,7 @@ def test_convert_dtypes_infer_objects(using_copy_on_write): tm.assert_series_equal(ser, ser_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_convert_dtypes(using_copy_on_write): df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]}) df_orig = df.copy() diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 4ec1d023c8ba7..a87baaedb9244 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -3,6 +3,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( DataFrame, Index, @@ -14,7 +16,7 @@ from pandas.tests.copy_view.util import get_array -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_concat_frames(using_copy_on_write): df = DataFrame({"b": ["a"] * 3}) df2 = DataFrame({"a": ["a"] * 3}) @@ -39,7 +41,7 @@ def test_concat_frames(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_concat_frames_updating_input(using_copy_on_write): df = DataFrame({"b": ["a"] * 3}) df2 = DataFrame({"a": ["a"] * 3}) @@ -201,7 +203,7 @@ def test_concat_copy_keyword(using_copy_on_write, copy): assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") 
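# Editor's note: the -/+ pair surrounding this note shows the recurring edit
# in this patch -- existing pyarrow-dependent xfails are narrowed with
# `and HAS_PYARROW`, while tests that only break *without* pyarrow get the
# inverse gate. A self-contained sketch of that idiom (the test name and
# body are placeholders, not taken from the patch):
import pytest
from pandas._config import using_string_dtype
from pandas.compat import HAS_PYARROW

@pytest.mark.xfail(
    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
)
def test_example_without_pyarrow():
    assert True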
+@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize( "func", [ @@ -319,7 +321,7 @@ def test_merge_copy_keyword(using_copy_on_write, copy): assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_join_on_key(using_copy_on_write): df_index = Index(["a", "b", "c"], name="key") @@ -353,7 +355,7 @@ def test_join_on_key(using_copy_on_write): tm.assert_frame_equal(df2, df2_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_join_multiple_dataframes_on_key(using_copy_on_write): df_index = Index(["a", "b", "c"], name="key") diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index 338b76cbf1e7a..1f7f8689d0779 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -3,6 +3,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( NA, ArrowDtype, @@ -159,7 +161,7 @@ def test_interpolate_cleaned_fill_method(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_interpolate_object_convert_no_op(using_copy_on_write): df = DataFrame({"a": ["a", "b", "c"], "b": 1}) arr_a = get_array(df, "a") diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index d870342ef9e29..295d93580f451 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -3,6 +3,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.errors import SettingWithCopyWarning import pandas as pd @@ -952,7 +953,7 @@ def test_head_tail(method, using_copy_on_write, warn_copy_on_write): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_infer_objects(using_copy_on_write): df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"}) df_orig = df.copy() @@ -974,6 +975,9 @@ def test_infer_objects(using_copy_on_write): tm.assert_frame_equal(df, df_orig) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_infer_objects_no_reference(using_copy_on_write): df = DataFrame( { @@ -1180,7 +1184,7 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_writ assert np.shares_memory(get_array(obj, "a"), get_array(view, "a")) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize("decimals", [-1, 0, 1]) def test_round(using_copy_on_write, warn_copy_on_write, decimals): df = DataFrame({"a": [1, 2], "b": "c"}) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 32da870c6d2e3..bc3edb1f72214 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -3,6 +3,8 @@ from pandas._config import using_string_dtype +from pandas.compat 
import HAS_PYARROW + from pandas import ( Categorical, DataFrame, @@ -66,7 +68,7 @@ def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write): assert np.shares_memory(arr, get_array(df, "a")) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_replace_regex_inplace(using_copy_on_write): df = DataFrame({"a": ["aaa", "bbb"]}) arr = get_array(df, "a") @@ -354,7 +356,7 @@ def test_replace_empty_list(using_copy_on_write): assert not df2._mgr._has_no_reference(0) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize("value", ["d", None]) def test_replace_object_list_inplace(using_copy_on_write, value): df = DataFrame({"a": ["a", "b", "c"]}) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index fad2560265d21..ff9f3cbed64a2 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -7,6 +7,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas.core.dtypes.common import is_string_dtype import pandas as pd @@ -140,6 +142,12 @@ class BaseArithmeticOpsTests(BaseOpsUtil): series_array_exc: type[Exception] | None = TypeError divmod_exc: type[Exception] | None = TypeError + # TODO(infer_string) need to remove import of pyarrow + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # series & scalar if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype): @@ -149,6 +157,11 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators): ser = pd.Series(data) self.check_opname(ser, op_name, ser.iloc[0]) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype): @@ -158,12 +171,22 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): df = pd.DataFrame({"A": data}) self.check_opname(df, op_name, data[0]) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op_name = all_arithmetic_operators ser = pd.Series(data) self.check_opname(ser, op_name, pd.Series([ser.iloc[0]] * len(ser))) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_divmod(self, data): ser = pd.Series(data) self._check_divmod_op(ser, divmod, 1) @@ -179,6 +202,7 @@ def test_divmod_series_array(self, data, data_for_twos): other = pd.Series(other) self._check_divmod_op(other, ops.rdivmod, ser) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_add_series_with_extension_array(self, data): # Check adding an ExtensionArray to a Series of the same dtype matches # the behavior of adding the arrays directly and then wrapping in a diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 135ea67c924d0..fd291908a4f96 100644 --- 
a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -148,6 +148,7 @@ def test_map(self, data, na_action): result = data.map(lambda x: x, na_action=na_action) tm.assert_extension_array_equal(result, data) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): # frame & scalar op_name = all_arithmetic_operators @@ -159,6 +160,7 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): ) super().test_arith_frame_with_scalar(data, op_name) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): op_name = all_arithmetic_operators if op_name == "__rmod__": diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index e38144f4c615b..b79b0a98efde4 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -18,6 +18,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.dtypes import NumpyEADtype import pandas as pd @@ -242,6 +244,7 @@ def test_insert_invalid(self, data, invalid_scalar): frame_scalar_exc = None series_array_exc = None + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_divmod(self, data): divmod_exc = None if data.dtype.kind == "O": @@ -249,6 +252,7 @@ def test_divmod(self, data): self.divmod_exc = divmod_exc super().test_divmod(data) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_divmod_series_array(self, data): ser = pd.Series(data) exc = None @@ -257,6 +261,7 @@ def test_divmod_series_array(self, data): self.divmod_exc = exc self._check_divmod_op(ser, divmod, data) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): opname = all_arithmetic_operators series_scalar_exc = None @@ -270,6 +275,7 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request) self.series_scalar_exc = series_scalar_exc super().test_arith_series_with_scalar(data, all_arithmetic_operators) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_array(self, data, all_arithmetic_operators): opname = all_arithmetic_operators series_array_exc = None @@ -278,6 +284,7 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): self.series_array_exc = series_array_exc super().test_arith_series_with_array(data, all_arithmetic_operators) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): opname = all_arithmetic_operators frame_scalar_exc = None diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index ec00044b84c49..795e6b974ca34 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -12,6 +12,7 @@ from pandas._config import using_string_dtype from pandas._libs import iNaT +from pandas.compat import HAS_PYARROW from pandas.errors import ( InvalidIndexError, PerformanceWarning, @@ -1208,7 +1209,9 @@ def test_loc_setitem_datetimelike_with_inference(self): ) tm.assert_series_equal(result, 
expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" + ) def test_getitem_boolean_indexing_mixed(self): df = DataFrame( { diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index dfbc3b4ca33ad..f0d868a4cb583 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -6,6 +6,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas.core.dtypes.common import is_scalar import pandas as pd @@ -983,6 +985,9 @@ def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): obj.mask(mask, null) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) @given(data=OPTIONAL_ONE_OF_ALL) def test_where_inplace_casting(data): # GH 22051 @@ -1079,6 +1084,9 @@ def test_where_producing_ea_cond_for_np_dtype(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] ) diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index 475632667a87a..4594f725b43d5 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -10,6 +10,7 @@ from pandas._config import using_string_dtype from pandas.compat import ( + HAS_PYARROW, IS64, PYPY, ) @@ -520,7 +521,7 @@ def test_info_int_columns(): assert result == expected -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_memory_usage_empty_no_warning(): # GH#50066 df = DataFrame(index=["a", "b"]) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 8d7a0b373f5f8..edba971408d04 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -6,10 +6,13 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.algos import ( Infinity, NegInfinity, ) +from pandas.compat import HAS_PYARROW from pandas import ( DataFrame, @@ -471,9 +474,18 @@ def test_rank_inf_nans_na_option( ], ) def test_rank_object_first( - self, frame_or_series, na_option, ascending, expected, using_infer_string + self, + request, + frame_or_series, + na_option, + ascending, + expected, + using_infer_string, ): obj = frame_or_series(["foo", "foo", None, "foo"]) + if using_string_dtype() and not HAS_PYARROW and isinstance(obj, Series): + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + result = obj.rank(method="first", na_option=na_option, ascending=ascending) expected = frame_or_series(expected) if using_infer_string and isinstance(obj, Series): diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 4136d641ef67f..7670b53f23173 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -1,6 +1,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + import pandas as pd import pandas._testing as tm @@ -132,6 +136,9 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture): 
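# Editor's note: several xfails added in this patch pass strict=False. A
# likely reason (an inference, not stated in the patch): the decorated tests
# are parametrized -- here over nulls_fixture -- and only some parameter
# combinations fail on the pyarrow-less build, so a strict xfail would turn
# the passing combinations into errors.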
tm.assert_series_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) def test_data_frame_value_counts_dropna_false(nulls_fixture): # GH 41334 df = pd.DataFrame( diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 339800538f47b..f6c7bd1f49b27 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -8,6 +8,8 @@ from pandas._config import using_string_dtype from pandas._config.config import option_context +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import ( DataFrame, @@ -113,7 +115,9 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(empty_frame) - @pytest.mark.xfail(using_string_dtype(), reason="surrogates not allowed") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="surrogates not allowed" + ) def test_column_name_contains_unicode_surrogate(self): # GH 25509 colname = "\ud83d" diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 0407388d61f51..6b4efc41aeffa 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -13,6 +13,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -1562,7 +1563,9 @@ def test_comparisons(self, simple_frame, float_frame, func): with pytest.raises(ValueError, match=msg): func(simple_frame, simple_frame[:2]) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" + ) def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne): # GH 11565 df = DataFrame( diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index c9eb2d5ca7be4..86d9dc0c7fbdc 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -24,6 +24,7 @@ from pandas._config import using_string_dtype from pandas._libs import lib +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -327,7 +328,7 @@ def test_constructor_dtype_nocast_view_2d_array( assert df2._mgr.arrays[0].flags.c_contiguous @td.skip_array_manager_invalid_test - @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") + @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="conversion copies") def test_1d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py index 16ca3a202f1e0..2684704f86b82 100644 --- a/pandas/tests/frame/test_logical_ops.py +++ b/pandas/tests/frame/test_logical_ops.py @@ -4,6 +4,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + from pandas import ( CategoricalIndex, DataFrame, @@ -96,6 +100,9 @@ def test_logical_ops_int_frame(self): res_ser = df1a_int["A"] | df1a_bool["A"] tm.assert_series_equal(res_ser, df1a_bool["A"]) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_logical_ops_invalid(self, using_infer_string): # GH#5808 diff --git 
a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index db15461ba0234..a64f06f6c0521 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -226,6 +226,7 @@ def float_frame_with_na(): class TestDataFrameAnalytics: # --------------------------------------------------------------------- # Reductions + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "opname", @@ -431,6 +432,7 @@ def test_stat_operators_attempt_obj_array(self, method, df, axis): expected[expected.isna()] = None tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) def test_mixed_ops(self, op): # GH#16116 @@ -534,7 +536,7 @@ def test_mean_mixed_string_decimal(self): df = DataFrame(d) with pytest.raises( - TypeError, match="unsupported operand type|does not support" + TypeError, match="unsupported operand type|does not support|Cannot perform" ): df.mean() result = df[["A", "C"]].mean() @@ -692,6 +694,7 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_mode_sortwarning(self, using_infer_string): # Check for the warning that is raised when the mode # results cannot be sorted @@ -990,7 +993,7 @@ def test_sum_mixed_datetime(self): def test_mean_corner(self, float_frame, float_string_frame): # unit test when have object data - msg = "Could not convert|does not support" + msg = "Could not convert|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): float_string_frame.mean(axis=0) @@ -1103,6 +1106,7 @@ def test_idxmin_empty(self, index, skipna, axis): expected = Series(dtype=index.dtype) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_idxmin_numeric_only(self, numeric_only): df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) @@ -1153,6 +1157,7 @@ def test_idxmax_empty(self, index, skipna, axis): expected = Series(dtype=index.dtype) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_idxmax_numeric_only(self, numeric_only): df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) @@ -1994,7 +1999,7 @@ def test_minmax_extensionarray(method, numeric_only): def test_frame_mixed_numeric_object_with_timestamp(ts_value): # GH 13912 df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]}) - with pytest.raises(TypeError, match="does not support reduction"): + with pytest.raises(TypeError, match="does not support operation|Cannot perform"): df.sum() diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index 6f7453d0d1655..8e1df679ee1b4 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -5,6 +5,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gte1p25 import pandas as pd @@ -43,6 +44,11 @@ def test_neg_object(self, df, expected): tm.assert_frame_equal(-df, expected) 
tm.assert_series_equal(-df["a"], expected["a"]) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) @pytest.mark.parametrize( "df", [ @@ -128,7 +134,9 @@ def test_pos_object(self, df): tm.assert_frame_equal(+df, df) tm.assert_series_equal(+df["a"], df["a"]) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" + ) @pytest.mark.parametrize( "df", [ diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index dcc0b39f0006c..51232fac7d6f6 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -10,6 +10,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td from pandas import ( @@ -536,6 +537,9 @@ def names_with_nulls_df(nulls_fixture): ) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, expected_data, expected_index", [ diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index dc1658e9acf3b..10f45cac1ff66 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -8,6 +8,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.errors import ( PerformanceWarning, SpecificationError, @@ -1743,6 +1744,10 @@ def g(group): tm.assert_series_equal(result, expected) +# TODO harmonize error messages +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) def test_set_group_name(df, grouper, using_infer_string): def f(group): diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 9c01e017dd29c..d843a992daee0 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -3,6 +3,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.missing import na_value_for_dtype @@ -12,6 +13,9 @@ from pandas.tests.groupby import get_groupby_method_args +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -55,6 +59,9 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_one_group( tm.assert_frame_equal(grouped, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -131,6 +138,9 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): tm.assert_frame_equal(grouped, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, idx, expected", [ @@ -206,6 +216,9 @@ def test_groupby_dataframe_slice_then_transform(dropna, index): tm.assert_series_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -287,6 +300,9 @@ def 
test_groupby_dropna_datetime_like_data( tm.assert_frame_equal(grouped, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) @pytest.mark.parametrize( "dropna, data, selected_data, levels", [ @@ -372,6 +388,9 @@ def test_groupby_dropna_with_multiindex_input(input_index, keys, series): tm.assert_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_groupby_nan_included(): # GH 35646 data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 493a5be735d1a..add2f3f18b348 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -3,10 +3,13 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.missing import ( NA, is_matching_na, ) +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -28,6 +31,9 @@ def test_get_indexer_strings(self, method, expected): tm.assert_numpy_array_equal(actual, expected) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_get_indexer_strings_raises(self, using_infer_string): index = Index(["b", "c"]) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 4d02ec853e0da..cf75f95d17b0a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -8,7 +8,12 @@ import numpy as np import pytest -from pandas.compat import IS64 +from pandas._config import using_string_dtype + +from pandas.compat import ( + HAS_PYARROW, + IS64, +) from pandas.errors import InvalidIndexError import pandas.util._test_decorators as td @@ -71,6 +76,9 @@ def test_constructor_casting(self, index): tm.assert_contains_all(arr, new_index) tm.assert_index_equal(index, new_index) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_constructor_copy(self, using_infer_string): index = Index(list("abc"), name="name") arr = np.array(index) @@ -338,6 +346,11 @@ def test_constructor_empty_special(self, empty, klass): def test_view_with_args(self, index): index.view("i8") + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) @pytest.mark.parametrize( "index", [ @@ -856,6 +869,11 @@ def test_isin(self, values, index, expected): result = index.isin(values) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_isin_nan_common_object( self, nulls_fixture, nulls_fixture2, using_infer_string ): diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index a2256322d968b..8d859a61a2bd5 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -9,6 +9,7 @@ from pandas._config import using_string_dtype from pandas._libs.tslibs import Timestamp +from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import ( is_integer_dtype, @@ -249,6 +250,11 @@ def test_repr_max_seq_item_setting(self, simple_index): repr(idx) assert "..." 
not in str(idx) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_ensure_copied_data(self, index): # Check the "copy" argument of each Index.__new__ is honoured diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 34d827a209dae..4d2d1e336ef07 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -15,6 +15,7 @@ from pandas._config import using_string_dtype from pandas._libs import index as libindex +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IndexingError import pandas.util._test_decorators as td @@ -1454,6 +1455,9 @@ def test_loc_setitem_categorical_values_partial_column_slice(self): df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_loc_setitem_single_row_categorical(self, using_infer_string): # GH#25495 df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) diff --git a/pandas/tests/io/formats/style/test_bar.py b/pandas/tests/io/formats/style/test_bar.py index b0e4712e8bb3d..d28c7c566d851 100644 --- a/pandas/tests/io/formats/style/test_bar.py +++ b/pandas/tests/io/formats/style/test_bar.py @@ -347,6 +347,7 @@ def test_styler_bar_with_NA_values(): def test_style_bar_with_pyarrow_NA_values(): + pytest.importorskip("pyarrow") data = """name,age,test1,test2,teacher Adam,15,95.0,80,Ashby Bob,16,81.0,82,Ashby diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 6d5f870f07206..90f77a7024235 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -4,6 +4,7 @@ import pytest +from pandas.compat import HAS_PYARROW from pandas.compat._optional import VERSIONS from pandas import ( @@ -117,7 +118,15 @@ def csv1(datapath): _py_parsers_only = [_pythonParser] _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] -_pyarrow_parsers_only = [pytest.param(_pyarrowParser, marks=pytest.mark.single_cpu)] +_pyarrow_parsers_only = [ + pytest.param( + _pyarrowParser, + marks=[ + pytest.mark.single_cpu, + pytest.mark.skipif(not HAS_PYARROW, reason="pyarrow is not installed"), + ], + ) +] _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] @@ -181,7 +190,16 @@ def _get_all_parser_float_precision_combinations(): parser = parser.values[0] for precision in parser.float_precision_choices: # Re-wrap in pytest.param for pyarrow - mark = pytest.mark.single_cpu if parser.engine == "pyarrow" else () + mark = ( + [ + pytest.mark.single_cpu, + pytest.mark.skipif( + not HAS_PYARROW, reason="pyarrow is not installed" + ), + ] + if parser.engine == "pyarrow" + else () + ) param = pytest.param((parser(), precision), marks=mark) params.append(param) ids.append(f"{parser_id}-{precision}") diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index cb9fd9e8da0df..c58db25991510 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -9,6 +9,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import ( Categorical, @@ -1244,6 +1246,9 @@ def test_idxminmax_object_dtype(self, using_infer_string): with pytest.raises(TypeError, 
match=msg): ser3.idxmin(skipna=False) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_idxminmax_object_frame(self): # GH#4279 df = DataFrame([["zimm", 2.5], ["biff", 1.0], ["bid", 12.0]]) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index e2c27fe5575db..0b9b1fc080cfe 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -10,6 +10,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import ( np_version_gt2, np_version_gte1p24, @@ -850,6 +851,11 @@ def test_mask_key(self, obj, key, expected, warn, val, indexer_sli): indexer_sli(obj)[mask] = val tm.assert_series_equal(obj, expected) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, + reason="TODO(infer_string)", + strict=False, + ) def test_series_where(self, obj, key, expected, warn, val, is_inplace): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 29d6e2036476e..e53cd753a4192 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -4,6 +4,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import ( DataFrame, @@ -167,6 +171,9 @@ def test_attrs(self): result = s + 1 assert result.attrs == {"version": 1} + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_inspect_getmembers(self): # GH38782 pytest.importorskip("jinja2") diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 197ef47759bf3..d3a718be47c18 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -6,6 +6,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( DataFrame, Index, @@ -148,6 +150,9 @@ def test_logical_operators_int_dtype_with_bool(self): expected = Series([False, True, True, True]) tm.assert_series_equal(result, expected) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_logical_operators_int_dtype_with_object(self, using_infer_string): # GH#9016: support bitwise op for integer types s_0123 = Series(range(4), dtype="int64") diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 76353ab25fca6..fcae835e4c3e2 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -1,6 +1,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import Series import pandas._testing as tm @@ -163,6 +167,9 @@ def test_validate_stat_keepdims(): np.sum(ser, keepdims=True) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_mean_with_convertible_string_raises(using_array_manager, using_infer_string): # GH#44008 ser = Series(["1", "2"]) @@ -183,6 +190,9 @@ def test_mean_with_convertible_string_raises(using_array_manager, using_infer_st df.mean() +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_mean_dont_convert_j_to_complex(using_array_manager): # GH#36703 df = 
pd.DataFrame([{"db": "J", "numeric": 123}]) @@ -204,6 +214,9 @@ def test_mean_dont_convert_j_to_complex(using_array_manager): np.mean(df["db"].astype("string").array) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_median_with_convertible_string_raises(using_array_manager): # GH#34671 this _could_ return a string "2", but definitely not float 2.0 msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support" diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index f353a7fa2f0fe..acf636616421f 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -6,7 +6,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import ( + HAS_PYARROW, IS64, is_platform_arm, is_platform_power, @@ -1420,6 +1423,9 @@ def test_rolling_corr_timedelta_index(index, window): tm.assert_almost_equal(result, expected) +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" +) def test_groupby_rolling_nan_included(): # GH 35542 data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} From a57d6748e4b5dadae2a37ff919d775eb8d99076e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 9 Aug 2024 10:18:41 -0700 Subject: [PATCH 213/396] REF (string dtype): de-duplicate _str_map (2) (#59451) * REF (string): de-duplicate _str_map (2) * mypy fixup --- pandas/core/arrays/string_.py | 179 +++++++++++++++-------------- pandas/core/arrays/string_arrow.py | 56 +-------- 2 files changed, 92 insertions(+), 143 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0929791ded58c..1919fdce12f11 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -342,6 +342,57 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: raise ValueError return cls._from_sequence(scalars, dtype=dtype) + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if self.dtype.na_value is np.nan: + return self._str_map_nan_semantics( + f, na_value=na_value, dtype=dtype, convert=convert + ) + + from pandas.arrays import BooleanArray + + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: type[IntegerArray | BooleanArray] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + elif dtype == np.dtype("bool"): + # GH#55736 + na_value = bool(na_value) + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + # error: Argument 1 to "dtype" has incompatible type + # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected + # "Type[object]" + dtype=np.dtype(cast(type, dtype)), + ) + + if not na_value_is_na: + mask[:] = False + + return constructor(result, mask) + + else: + return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) + def _str_map_str_or_object( self, dtype, @@ -373,6 +424,45 @@ def _str_map_str_or_object( # -> We don't know the result type. E.g. `.get` can return anything. 
return lib.map_infer_mask(arr, f, mask.view("uint8")) + def _str_map_nan_semantics( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + convert = convert and not np.all(mask) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + na_value_is_na = isna(na_value) + if na_value_is_na: + if is_integer_dtype(dtype): + na_value = 0 + else: + na_value = True + + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(cast(type, dtype)), + ) + if na_value_is_na and mask.any(): + if is_integer_dtype(dtype): + result = result.astype("float64") + else: + result = result.astype("object") + result[mask] = np.nan + return result + + else: + return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) + # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" @@ -727,95 +817,6 @@ def _cmp_method(self, other, op): # base class "NumpyExtensionArray" defined the type as "float") _str_na_value = libmissing.NA # type: ignore[assignment] - def _str_map_nan_semantics( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - convert = convert and not np.all(mask) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - na_value_is_na = isna(na_value) - if na_value_is_na: - if is_integer_dtype(dtype): - na_value = 0 - else: - na_value = True - - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(cast(type, dtype)), - ) - if na_value_is_na and mask.any(): - if is_integer_dtype(dtype): - result = result.astype("float64") - else: - result = result.astype("object") - result[mask] = np.nan - return result - - else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) - - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if self.dtype.na_value is np.nan: - return self._str_map_nan_semantics( - f, na_value=na_value, dtype=dtype, convert=convert - ) - - from pandas.arrays import BooleanArray - - if dtype is None: - dtype = StringDtype(storage="python") - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray | BooleanArray] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - elif dtype == np.dtype("bool"): - na_value = bool(na_value) - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - # error: Argument 1 to "dtype" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected - # "Type[object]" - dtype=np.dtype(dtype), # type: ignore[arg-type] - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) - class StringArrayNumpySemantics(StringArray): _storage = "python" diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py 
index 607f6f7e4246a..4bdbf1f6a606f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -283,6 +283,8 @@ def _data(self): # base class "ObjectStringArrayMixin" defined the type as "float") _str_na_value = libmissing.NA # type: ignore[assignment] + _str_map = BaseStringArray._str_map + def _str_map_nan_semantics( self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True ): @@ -322,60 +324,6 @@ def _str_map_nan_semantics( else: return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if self.dtype.na_value is np.nan: - return self._str_map_nan_semantics( - f, na_value=na_value, dtype=dtype, convert=convert - ) - - # TODO: de-duplicate with StringArray method. This method is moreless copy and - # paste. - - from pandas.arrays import ( - BooleanArray, - IntegerArray, - ) - - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray | BooleanArray] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - # error: Argument 1 to "dtype" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected - # "Type[object]" - dtype=np.dtype(dtype), # type: ignore[arg-type] - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) - def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True ): From 138140d129cb9d4cc653230f2bde000bbb786ea8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 9 Aug 2024 14:13:06 -0700 Subject: [PATCH 214/396] REF (string): de-duplicate str_map_nan_semantics (#59464) REF: de-duplicate str_map_nan_semantics --- pandas/core/arrays/string_.py | 9 ++++--- pandas/core/arrays/string_arrow.py | 42 ------------------------------ 2 files changed, 5 insertions(+), 46 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 1919fdce12f11..f2811703cbecf 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -391,7 +391,7 @@ def _str_map( return constructor(result, mask) else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) + return self._str_map_str_or_object(dtype, na_value, arr, f, mask) def _str_map_str_or_object( self, @@ -400,7 +400,6 @@ def _str_map_str_or_object( arr: np.ndarray, f, mask: npt.NDArray[np.bool_], - convert: bool, ): # _str_map helper for case where dtype is either string dtype or object if is_string_dtype(dtype) and not is_object_dtype(dtype): @@ -434,7 +433,6 @@ def _str_map_nan_semantics( mask = isna(self) arr = np.asarray(self) - convert = convert and not np.all(mask) if is_integer_dtype(dtype) or is_bool_dtype(dtype): na_value_is_na = isna(na_value) @@ -453,6 +451,9 @@ def _str_map_nan_semantics( dtype=np.dtype(cast(type, dtype)), ) if na_value_is_na and mask.any(): + # TODO: we could alternatively do this check before map_infer_mask + # and adjust the dtype/na_value we pass there. Which is more + # performant? 
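# Editor's note on the branch below: numpy integer and boolean arrays cannot
# hold NaN, so when the mask has any hits the integer result is upcast to
# float64 and the boolean result to object before the masked positions are
# set to np.nan (matching what maybe_convert_objects would produce, per the
# comment in the duplicate being removed from string_arrow.py).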
if is_integer_dtype(dtype): result = result.astype("float64") else: @@ -461,7 +462,7 @@ def _str_map_nan_semantics( return result else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) + return self._str_map_str_or_object(dtype, na_value, arr, f, mask) # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4bdbf1f6a606f..c643d4fed4b20 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -7,7 +7,6 @@ TYPE_CHECKING, Callable, Union, - cast, ) import warnings @@ -24,8 +23,6 @@ from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( - is_bool_dtype, - is_integer_dtype, is_scalar, pandas_dtype, ) @@ -285,45 +282,6 @@ def _data(self): _str_map = BaseStringArray._str_map - def _str_map_nan_semantics( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - if is_integer_dtype(dtype): - na_value = np.nan - else: - na_value = False - - dtype = np.dtype(cast(type, dtype)) - if mask.any(): - # numpy int/bool dtypes cannot hold NaNs so we must convert to - # float64 for int (to match maybe_convert_objects) or - # object for bool (again to match maybe_convert_objects) - if is_integer_dtype(dtype): - dtype = np.dtype("float64") - else: - dtype = np.dtype(object) - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=dtype, - ) - return result - - else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) - def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True ): From 3a362d8f2af02efad2d50ef1512a619839f1a0a0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 12 Aug 2024 19:14:08 +0200 Subject: [PATCH 215/396] BUG (string dtype): convert dictionary input to materialized string array in ArrowStringArray constructor (#59479) --- pandas/core/arrays/string_arrow.py | 16 ++++++++++------ pandas/tests/arrays/string_/test_string_arrow.py | 11 +++++------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index c643d4fed4b20..7119f6321a7ff 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -125,18 +125,22 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr def __init__(self, values) -> None: _chk_pyarrow_available() - if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( - values.type + if isinstance(values, (pa.Array, pa.ChunkedArray)) and ( + pa.types.is_string(values.type) + or ( + pa.types.is_dictionary(values.type) + and ( + pa.types.is_string(values.type.value_type) + or pa.types.is_large_string(values.type.value_type) + ) + ) ): values = pc.cast(values, pa.large_string()) super().__init__(values) self._dtype = StringDtype(storage=self._storage, na_value=self._na_value) - if not pa.types.is_large_string(self._pa_array.type) and not ( - pa.types.is_dictionary(self._pa_array.type) - and pa.types.is_large_string(self._pa_array.type.value_type) - ): + if not pa.types.is_large_string(self._pa_array.type): raise ValueError( "ArrowStringArray requires a PyArrow (chunked) 
array of " "large_string type" diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 8d5c16e448cee..6bab04e95de9e 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -88,19 +88,18 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked): ArrowStringArray(arr) -@pytest.mark.xfail( - reason="dict conversion does not seem to be implemented for large string in arrow" -) +@pytest.mark.parametrize("string_type", ["string", "large_string"]) @pytest.mark.parametrize("chunked", [True, False]) -def test_constructor_valid_string_type_value_dictionary(chunked): +def test_constructor_valid_string_type_value_dictionary(string_type, chunked): pa = pytest.importorskip("pyarrow") - arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode() + arr = pa.array(["1", "2", "3"], getattr(pa, string_type)()).dictionary_encode() if chunked: arr = pa.chunked_array(arr) arr = ArrowStringArray(arr) - assert pa.types.is_string(arr._pa_array.type.value_type) + # dictionary type get converted to dense large string array + assert pa.types.is_large_string(arr._pa_array.type) def test_constructor_from_list(): From 3a03c6111800f7343e1ef577366e073880ca8983 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 12 Aug 2024 19:33:35 +0200 Subject: [PATCH 216/396] String dtype: fix convert_dtypes() to convert NaN-string to NA-string (#59470) * String dtype: fix convert_dtypes() to convert NaN-string to NA-string * fix CoW tracking for conversion to python storage strings * remove xfails --- pandas/core/dtypes/cast.py | 10 +++++++++- pandas/core/internals/blocks.py | 9 +++++++-- pandas/tests/frame/methods/test_convert_dtypes.py | 10 +--------- pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 2 -- pandas/tests/series/methods/test_convert_dtypes.py | 6 +++--- 5 files changed, 20 insertions(+), 17 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9580ab1b520e0..7a92b7306beea 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1025,6 +1025,8 @@ def convert_dtypes( ------- np.dtype, or ExtensionDtype """ + from pandas.core.arrays.string_ import StringDtype + inferred_dtype: str | DtypeObj if ( @@ -1103,12 +1105,18 @@ def convert_dtypes( # If we couldn't do anything else, then we retain the dtype inferred_dtype = input_array.dtype + elif ( + convert_string + and isinstance(input_array.dtype, StringDtype) + and input_array.dtype.na_value is np.nan + ): + inferred_dtype = pandas_dtype_func("string") + else: inferred_dtype = input_array.dtype if dtype_backend == "pyarrow": from pandas.core.arrays.arrow.array import to_pyarrow_type - from pandas.core.arrays.string_ import StringDtype assert not isinstance(inferred_dtype, str) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 2f448bf249a2e..cd1639188b1ad 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -657,8 +657,13 @@ def convert( convert_non_numeric=True, ) refs = None - if copy and res_values is values: - res_values = values.copy() + if ( + copy + and res_values is values + or isinstance(res_values, NumpyExtensionArray) + and res_values._ndarray is values + ): + res_values = res_values.copy() elif res_values is values: refs = self.refs diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 59779234b46d9..e7f6e5d625d3e 
100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -3,21 +3,15 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm class TestConvertDtypes: - # TODO convert_dtypes should not use NaN variant of string dtype, but always NA - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) - def test_convert_dtypes( - self, convert_integer, expected, string_storage, using_infer_string - ): + def test_convert_dtypes(self, convert_integer, expected, string_storage): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here df = pd.DataFrame( @@ -182,7 +176,6 @@ def test_convert_dtypes_pyarrow_timestamp(self): result = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_avoid_block_splitting(self): # GH#55341 df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"}) @@ -197,7 +190,6 @@ def test_convert_dtypes_avoid_block_splitting(self): tm.assert_frame_equal(result, expected) assert result._mgr.nblocks == 2 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_from_arrow(self): # GH#56581 df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"]) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index da17999bba4ca..5138dfb61eaac 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -463,7 +463,6 @@ def test_dtype_backend_and_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend_string(all_parsers, string_storage): # GH#36712 pa = pytest.importorskip("pyarrow") @@ -507,7 +506,6 @@ def test_dtype_backend_ea_dtype_specified(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend_pyarrow(all_parsers, request): # GH#36712 pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 46fed9032c13d..c2cc838619790 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -230,9 +230,9 @@ def test_convert_dtypes( and params[0] and not params[1] ): - # If we would convert with convert strings then infer_objects converts - # with the option - expected_dtype = "string[pyarrow_numpy]" + # If convert_string=False and infer_objects=True, we end up with the + # default string dtype instead of preserving object for string data + expected_dtype = pd.StringDtype(na_value=np.nan) expected = pd.Series(data, dtype=expected_dtype) tm.assert_series_equal(result, expected) From 13ad111898b6d570fe4962d5b1cbe96327a4484c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 13 Aug 2024 00:28:50 +0200 Subject: [PATCH 217/396] String dtype: honor mode.string_storage option (and change default to None) (#59488) * String dtype: honor mode.string_storage option (and change default to None) * fix test + explicitly 
test default * use 'auto' instead of None --- pandas/core/arrays/string_.py | 12 ++++++++---- pandas/core/config_init.py | 7 +++---- pandas/tests/arrays/string_/test_string_arrow.py | 10 ++++------ pandas/tests/dtypes/test_common.py | 13 +++++++++---- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f2811703cbecf..c881437ba25af 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -136,12 +136,16 @@ def __init__( # infer defaults if storage is None: if na_value is not libmissing.NA: - if HAS_PYARROW: - storage = "pyarrow" - else: - storage = "python" + storage = get_option("mode.string_storage") + if storage == "auto": + if HAS_PYARROW: + storage = "pyarrow" + else: + storage = "python" else: storage = get_option("mode.string_storage") + if storage == "auto": + storage = "python" if storage == "pyarrow_numpy": # TODO raise a deprecation warning diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 4cd7e50f0ec50..a1df455eebacf 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -505,13 +505,12 @@ def use_inf_as_na_cb(key) -> None: string_storage_doc = """ : string - The default storage for StringDtype. This option is ignored if - ``future.infer_string`` is set to True. + The default storage for StringDtype. """ def is_valid_string_storage(value: Any) -> None: - legal_values = ["python", "pyarrow"] + legal_values = ["auto", "python", "pyarrow"] if value not in legal_values: msg = "Value must be one of python|pyarrow" if value == "pyarrow_numpy": @@ -526,7 +525,7 @@ def is_valid_string_storage(value: Any) -> None: with cf.config_prefix("mode"): cf.register_option( "string_storage", - "python", + "auto", string_storage_doc, # validator=is_one_of_factory(["python", "pyarrow"]), validator=is_valid_string_storage, diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 6bab04e95de9e..72d672ba8a7d9 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -4,7 +4,6 @@ import numpy as np import pytest -from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -27,11 +26,10 @@ def test_eq_all_na(): tm.assert_extension_array_equal(result, expected) -def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage == "python" and HAS_PYARROW: - # string storage with na_value=NaN always uses pyarrow if available - # -> does not yet honor the option - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) +def test_config(string_storage, using_infer_string): + # with the default string_storage setting + # always "python" at the moment + assert StringDtype().storage == "python" with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index e0232bb292d6e..ccd30caba5dee 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td from pandas.core.dtypes.astype import astype_array @@ -802,13 +803,17 @@ def test_pandas_dtype_ea_not_instance(): def test_pandas_dtype_string_dtypes(string_storage): - # TODO(infer_string) remove skip if "python" is 
supported - pytest.importorskip("pyarrow") + with pd.option_context("future.infer_string", True): + # with the default string_storage setting + result = pandas_dtype("str") + assert result == pd.StringDtype( + "pyarrow" if HAS_PYARROW else "python", na_value=np.nan + ) + with pd.option_context("future.infer_string", True): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") - # TODO(infer_string) hardcoded to pyarrow until python is supported - assert result == pd.StringDtype("pyarrow", na_value=np.nan) + assert result == pd.StringDtype(string_storage, na_value=np.nan) with pd.option_context("future.infer_string", False): with pd.option_context("string_storage", string_storage): From ffd22c11bdc9b3b6f5891dc48f42b1c7fbf6c493 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 13 Aug 2024 15:46:03 -0700 Subject: [PATCH 218/396] BUG (string): ArrowEA comparisons with mismatched types (#59505) * BUG: ArrowEA comparisons with mismatched types * move whatsnew * GH ref --- pandas/core/arrays/arrow/array.py | 8 ++++++- pandas/core/arrays/string_arrow.py | 6 +---- pandas/tests/series/test_logical_ops.py | 31 +++++++++++++++++++++---- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 6c44b7759f0e2..46f2cbb2ebeef 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -704,7 +704,13 @@ def _cmp_method(self, other, op): if isinstance( other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): - result = pc_func(self._pa_array, self._box_pa(other)) + try: + result = pc_func(self._pa_array, self._box_pa(other)) + except pa.ArrowNotImplementedError: + # TODO: could this be wrong if other is object dtype? + # in which case we need to operate pointwise? 
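# ---------------------------------------------------------------------
# Illustrative sketch (not part of the patch), mirroring the test added
# further down in this commit: comparisons against mismatched types now
# fall back to invalid_comparison instead of surfacing
# pa.ArrowNotImplementedError.
import pandas as pd

ser = pd.Series([False, True])
ser2 = pd.Series(["a", "b"], dtype="string[pyarrow]")
ser2 == ser  # all-False, dtype bool[pyarrow], instead of raising
ser2 != ser  # all-True, dtype bool[pyarrow]
try:
    ser2 > ser
except TypeError:
    pass  # ordered comparisons still raise "Invalid comparison"
# ---------------------------------------------------------------------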
+ result = ops.invalid_comparison(self, other, op) + result = pa.array(result, type=pa.bool_()) elif is_scalar(other): try: result = pc_func(self._pa_array, self._box_pa(other)) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7119f6321a7ff..00d92958cc8dc 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -37,7 +37,6 @@ BaseStringArray, StringDtype, ) -from pandas.core.ops import invalid_comparison from pandas.core.strings.object_array import ObjectStringArrayMixin if not pa_version_under10p1: @@ -565,10 +564,7 @@ def _convert_int_dtype(self, result): return result def _cmp_method(self, other, op): - try: - result = super()._cmp_method(other, op) - except pa.ArrowNotImplementedError: - return invalid_comparison(self, other, op) + result = super()._cmp_method(other, op) if op == operator.ne: return result.to_numpy(np.bool_, na_value=True) else: diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index d3a718be47c18..ff21427b71cf9 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -9,6 +9,7 @@ from pandas.compat import HAS_PYARROW from pandas import ( + ArrowDtype, DataFrame, Index, Series, @@ -539,18 +540,38 @@ def test_int_dtype_different_index_not_bool(self): result = ser1 ^ ser2 tm.assert_series_equal(result, expected) + # TODO: this belongs in comparison tests def test_pyarrow_numpy_string_invalid(self): # GH#56008 - pytest.importorskip("pyarrow") + pa = pytest.importorskip("pyarrow") ser = Series([False, True]) ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]") result = ser == ser2 - expected = Series(False, index=ser.index) - tm.assert_series_equal(result, expected) + expected_eq = Series(False, index=ser.index) + tm.assert_series_equal(result, expected_eq) result = ser != ser2 - expected = Series(True, index=ser.index) - tm.assert_series_equal(result, expected) + expected_ne = Series(True, index=ser.index) + tm.assert_series_equal(result, expected_ne) with pytest.raises(TypeError, match="Invalid comparison"): ser > ser2 + + # GH#59505 + ser3 = ser2.astype("string[pyarrow]") + result3_eq = ser3 == ser + tm.assert_series_equal(result3_eq, expected_eq.astype("bool[pyarrow]")) + result3_ne = ser3 != ser + tm.assert_series_equal(result3_ne, expected_ne.astype("bool[pyarrow]")) + + with pytest.raises(TypeError, match="Invalid comparison"): + ser > ser3 + + ser4 = ser2.astype(ArrowDtype(pa.string())) + result4_eq = ser4 == ser + tm.assert_series_equal(result4_eq, expected_eq.astype("bool[pyarrow]")) + result4_ne = ser4 != ser + tm.assert_series_equal(result4_ne, expected_ne.astype("bool[pyarrow]")) + + with pytest.raises(TypeError, match="Invalid comparison"): + ser > ser4 From 86ce6c79fbc4a45c6e6de57e709ca1ddedefba37 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 14 Aug 2024 09:08:54 +0200 Subject: [PATCH 219/396] TST (string dtype): clean up construction of expected string arrays (#59481) --- pandas/tests/io/excel/test_readers.py | 48 +++++++------------ pandas/tests/io/json/test_pandas.py | 38 ++++----------- .../io/parser/dtypes/test_dtypes_basic.py | 30 ++++-------- pandas/tests/io/parser/test_read_fwf.py | 31 ++++-------- pandas/tests/io/test_clipboard.py | 30 ++++-------- pandas/tests/io/test_feather.py | 28 ++++------- pandas/tests/io/test_html.py | 36 ++++---------- pandas/tests/io/test_sql.py | 29 +++-------- pandas/tests/io/xml/test_xml.py | 37 +++----------- 9 files changed, 82 
insertions(+), 225 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 1b79b4bff1cea..c899fd01ce7bb 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -30,10 +30,6 @@ read_csv, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) if is_platform_windows(): pytestmark = pytest.mark.single_cpu @@ -663,41 +659,31 @@ def test_dtype_backend_and_dtype(self, read_ext): @pytest.mark.xfail( using_string_dtype(), reason="infer_string takes precedence", strict=False ) - def test_dtype_backend_string(self, read_ext, string_storage): + def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") - pa = pytest.importorskip("pyarrow") + df = DataFrame( + { + "a": np.array(["a", "b"], dtype=np.object_), + "b": np.array(["x", pd.NA], dtype=np.object_), + } + ) + df.to_excel(tmp_excel, sheet_name="test", index=False) with pd.option_context("mode.string_storage", string_storage): - df = DataFrame( - { - "a": np.array(["a", "b"], dtype=np.object_), - "b": np.array(["x", pd.NA], dtype=np.object_), - } + result = pd.read_excel( + tmp_excel, sheet_name="test", dtype_backend="numpy_nullable" ) - with tm.ensure_clean(read_ext) as file_path: - df.to_excel(file_path, sheet_name="test", index=False) - result = pd.read_excel( - file_path, sheet_name="test", dtype_backend="numpy_nullable" - ) - if string_storage == "python": - expected = DataFrame( - { - "a": StringArray(np.array(["a", "b"], dtype=np.object_)), - "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)), - } - ) - else: - expected = DataFrame( - { - "a": ArrowStringArray(pa.array(["a", "b"])), - "b": ArrowStringArray(pa.array(["x", None])), - } - ) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + { + "a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)), + "b": Series(["x", None], dtype=pd.StringDtype(string_storage)), + } + ) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index cb94111aedffd..c0ecd4fd7cf59 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -31,11 +31,6 @@ read_json, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) -from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics from pandas.io.json import ujson_dumps @@ -2037,14 +2032,10 @@ def test_json_uint64(self): assert result == expected @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - @pytest.mark.parametrize( - "orient", ["split", "records", "values", "index", "columns"] - ) def test_read_json_dtype_backend( self, string_storage, dtype_backend, orient, using_infer_string ): # GH#50750 - pa = pytest.importorskip("pyarrow") df = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), @@ -2058,30 +2049,18 @@ def test_read_json_dtype_backend( } ) - if using_infer_string: - string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None])) - elif string_storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], 
dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) - - else: - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) - out = df.to_json(orient=orient) with pd.option_context("mode.string_storage", string_storage): result = read_json( StringIO(out), dtype_backend=dtype_backend, orient=orient ) + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), @@ -2090,12 +2069,13 @@ def test_read_json_dtype_backend( "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": Series(["a", "b", "c"], dtype=string_dtype), + "h": Series(["a", "b", None], dtype=string_dtype), } ) if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") from pandas.arrays import ArrowExtensionArray expected = DataFrame( diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 5138dfb61eaac..e072e97637f2a 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -18,11 +18,7 @@ Timestamp, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - IntegerArray, - StringArray, -) +from pandas.core.arrays import IntegerArray pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" @@ -465,8 +461,6 @@ def test_dtype_backend_and_dtype(all_parsers): def test_dtype_backend_string(all_parsers, string_storage): # GH#36712 - pa = pytest.importorskip("pyarrow") - with pd.option_context("mode.string_storage", string_storage): parser = all_parsers @@ -476,21 +470,13 @@ def test_dtype_backend_string(all_parsers, string_storage): """ result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable") - if string_storage == "python": - expected = DataFrame( - { - "a": StringArray(np.array(["a", "b"], dtype=np.object_)), - "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)), - } - ) - else: - expected = DataFrame( - { - "a": ArrowStringArray(pa.array(["a", "b"])), - "b": ArrowStringArray(pa.array(["x", None])), - } - ) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + { + "a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)), + "b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)), + } + ) + tm.assert_frame_equal(result, expected) def test_dtype_backend_ea_dtype_specified(all_parsers): diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 53426bebaa70b..72a341d8487e7 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -14,8 +14,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import EmptyDataError import pandas as pd @@ -24,10 +22,6 @@ DatetimeIndex, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - 
StringArray, -) from pandas.io.common import urlopen from pandas.io.parsers import ( @@ -968,39 +962,30 @@ def test_widths_and_usecols(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend(string_storage, dtype_backend): # GH#50289 - if string_storage == "python": - arr = StringArray(np.array(["a", "b"], dtype=np.object_)) - arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_)) - elif dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - arr = ArrowExtensionArray(pa.array(["a", "b"])) - arr_na = ArrowExtensionArray(pa.array([None, "a"])) - else: - pa = pytest.importorskip("pyarrow") - arr = ArrowStringArray(pa.array(["a", "b"])) - arr_na = ArrowStringArray(pa.array([None, "a"])) - data = """a b c d e f g h i 1 2.5 True a 3 4.5 False b True 6 7.5 a""" with pd.option_context("mode.string_storage", string_storage): result = read_fwf(StringIO(data), dtype_backend=dtype_backend) + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = DataFrame( { "a": pd.Series([1, 3], dtype="Int64"), "b": pd.Series([2.5, 4.5], dtype="Float64"), "c": pd.Series([True, False], dtype="boolean"), - "d": arr, + "d": pd.Series(["a", "b"], dtype=string_dtype), "e": pd.Series([pd.NA, True], dtype="boolean"), "f": pd.Series([pd.NA, 6], dtype="Int64"), "g": pd.Series([pd.NA, 7.5], dtype="Float64"), - "h": arr_na, + "h": pd.Series([None, "a"], dtype=string_dtype), "i": pd.Series([pd.NA, pd.NA], dtype="Int64"), } ) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index da998f058471c..3a52ff5acc0b3 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -19,10 +19,6 @@ read_clipboard, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.clipboard import ( CheckedCall, @@ -358,23 +354,15 @@ def test_read_clipboard_dtype_backend( self, clipboard, string_storage, dtype_backend, engine ): # GH#50502 - if string_storage == "pyarrow" or dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - - if string_storage == "python": - string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) - string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow" and engine != "c": + if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["x", "y"])) - string_array_na = ArrowExtensionArray(pa.array(["x", None])) - + if engine == "c" and string_storage == "pyarrow": + # TODO avoid this exception? 
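# ---------------------------------------------------------------------
# Illustrative sketch (not part of the patch): the pattern these IO
# tests converge on for building expected string columns. The two
# variables below stand in for the pytest fixture values used above.
import pandas as pd

string_storage = "python"         # fixture value: "python" or "pyarrow"
dtype_backend = "numpy_nullable"  # fixture value: "numpy_nullable" or "pyarrow"

if dtype_backend == "pyarrow":
    import pyarrow as pa

    string_dtype = pd.ArrowDtype(pa.string())
else:
    string_dtype = pd.StringDtype(string_storage)

expected_col = pd.Series(["x", None], dtype=string_dtype)
# ---------------------------------------------------------------------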
+ string_dtype = pd.ArrowDtype(pa.large_string()) + else: + string_dtype = pd.ArrowDtype(pa.string()) else: - string_array = ArrowStringArray(pa.array(["x", "y"])) - string_array_na = ArrowStringArray(pa.array(["x", None])) + string_dtype = pd.StringDtype(string_storage) text = """a,b,c,d,e,f,g,h,i x,1,4.0,x,2,4.0,,True,False @@ -386,10 +374,10 @@ def test_read_clipboard_dtype_backend( expected = DataFrame( { - "a": string_array, + "a": Series(["x", "y"], dtype=string_dtype), "b": Series([1, 2], dtype="Int64"), "c": Series([4.0, 5.0], dtype="Float64"), - "d": string_array_na, + "d": Series(["x", None], dtype=string_dtype), "e": Series([2, NA], dtype="Int64"), "f": Series([4.0, NA], dtype="Float64"), "g": Series([NA, NA], dtype="Int64"), diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index d169fab3f1832..d1201686edefa 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -6,10 +6,6 @@ import pandas as pd import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.feather_format import read_feather, to_feather # isort:skip @@ -188,25 +184,17 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): } ) - if string_storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow": - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) - - else: - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) - with tm.ensure_clean() as path: to_feather(df, path) with pd.option_context("mode.string_storage", string_storage): result = read_feather(path, dtype_backend=dtype_backend) + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = pd.DataFrame( { "a": pd.Series([1, np.nan, 3], dtype="Int64"), @@ -215,8 +203,8 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"), "e": pd.Series([True, False, pd.NA], dtype="boolean"), "f": pd.Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": pd.Series(["a", "b", "c"], dtype=string_dtype), + "h": pd.Series(["a", "b", None], dtype=string_dtype), } ) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 298b9115b51e4..56c9cf8df0e89 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -13,8 +13,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import is_platform_windows import pandas.util._test_decorators as td @@ -31,17 +29,9 @@ to_datetime, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.common import file_path_to_url -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - @pytest.fixture( params=[ @@ -160,7 +150,7 @@ def test_to_html_compat(self, flavor_read_html): df = ( DataFrame( np.random.default_rng(2).random((4, 3)), - columns=pd.Index(list("abc"), dtype=object), + columns=pd.Index(list("abc")), ) # pylint: 
disable-next=consider-using-f-string .map("{:.3f}".format).astype(float) @@ -186,24 +176,16 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): } ) - if string_storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) - elif dtype_backend == "pyarrow": - pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) - else: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) - out = df.to_html(index=False) with pd.option_context("mode.string_storage", string_storage): result = flavor_read_html(StringIO(out), dtype_backend=dtype_backend)[0] + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), @@ -212,8 +194,8 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": Series(["a", "b", "c"], dtype=string_dtype), + "h": Series(["a", "b", None], dtype=string_dtype), } ) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 792c532fa8032..b1557d71f15e4 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -42,10 +42,6 @@ to_timedelta, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.util.version import Version from pandas.io import sql @@ -3678,24 +3674,13 @@ def dtype_backend_data() -> DataFrame: @pytest.fixture def dtype_backend_expected(): - def func(storage, dtype_backend, conn_name) -> DataFrame: - string_array: StringArray | ArrowStringArray - string_array_na: StringArray | ArrowStringArray - if storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow": + def func(string_storage, dtype_backend, conn_name) -> DataFrame: + string_dtype: pd.StringDtype | pd.ArrowDtype + if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment] - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment] - + string_dtype = pd.ArrowDtype(pa.string()) else: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) + string_dtype = pd.StringDtype(string_storage) df = DataFrame( { @@ -3705,8 +3690,8 @@ def func(storage, dtype_backend, conn_name) -> DataFrame: "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, pd.NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": Series(["a", "b", "c"], dtype=string_dtype), + "h": Series(["a", "b", None], dtype=string_dtype), } ) if 
dtype_backend == "pyarrow": diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 35beda37acf51..04c040efa767c 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -14,8 +14,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat._optional import import_optional_dependency from pandas.errors import ( EmptyDataError, @@ -30,11 +28,6 @@ Series, ) import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) -from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics from pandas.io.common import get_handle from pandas.io.xml import read_xml @@ -2007,7 +2000,6 @@ def test_s3_parser_consistency(s3_public_bucket_with_data, s3so): tm.assert_frame_equal(df_lxml, df_etree) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_read_xml_nullable_dtypes( parser, string_storage, dtype_backend, using_infer_string ): @@ -2038,36 +2030,21 @@ def test_read_xml_nullable_dtypes( """ - if using_infer_string: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArrayNumpySemantics(pa.array(["x", "y"])) - string_array_na = ArrowStringArrayNumpySemantics(pa.array(["x", None])) - - elif string_storage == "python": - string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) - string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) + with pd.option_context("mode.string_storage", string_storage): + result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend) - elif dtype_backend == "pyarrow": + if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["x", "y"])) - string_array_na = ArrowExtensionArray(pa.array(["x", None])) - + string_dtype = pd.ArrowDtype(pa.string()) else: - pa = pytest.importorskip("pyarrow") - string_array = ArrowStringArray(pa.array(["x", "y"])) - string_array_na = ArrowStringArray(pa.array(["x", None])) - - with pd.option_context("mode.string_storage", string_storage): - result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend) + string_dtype = pd.StringDtype(string_storage) expected = DataFrame( { - "a": string_array, + "a": Series(["x", "y"], dtype=string_dtype), "b": Series([1, 2], dtype="Int64"), "c": Series([4.0, 5.0], dtype="Float64"), - "d": string_array_na, + "d": Series(["x", None], dtype=string_dtype), "e": Series([2, NA], dtype="Int64"), "f": Series([4.0, NA], dtype="Float64"), "g": Series([NA, NA], dtype="Int64"), From 9f3526fdfeb54d313e312581d999be8d36586097 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 22 Aug 2024 15:07:41 -0400 Subject: [PATCH 220/396] TST (string dtype): clean up construction of expected string arrays (#59481) --- pandas/tests/io/excel/test_readers.py | 39 ++++++++++++++------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index c899fd01ce7bb..8dc76d8f747cb 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -659,31 +659,34 @@ def test_dtype_backend_and_dtype(self, read_ext): @pytest.mark.xfail( using_string_dtype(), reason="infer_string takes precedence", strict=False ) - def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel): + def test_dtype_backend_string(self, read_ext, string_storage): # GH#36712 if read_ext 
in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") - df = DataFrame( - { - "a": np.array(["a", "b"], dtype=np.object_), - "b": np.array(["x", pd.NA], dtype=np.object_), - } - ) - df.to_excel(tmp_excel, sheet_name="test", index=False) - with pd.option_context("mode.string_storage", string_storage): - result = pd.read_excel( - tmp_excel, sheet_name="test", dtype_backend="numpy_nullable" + df = DataFrame( + { + "a": np.array(["a", "b"], dtype=np.object_), + "b": np.array(["x", pd.NA], dtype=np.object_), + } ) - expected = DataFrame( - { - "a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)), - "b": Series(["x", None], dtype=pd.StringDtype(string_storage)), - } - ) - tm.assert_frame_equal(result, expected) + with tm.ensure_clean(read_ext) as file_path: + df.to_excel(file_path, sheet_name="test", index=False) + result = pd.read_excel( + file_path, sheet_name="test", dtype_backend="numpy_nullable" + ) + + expected = DataFrame( + { + "a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)), + "b": Series(["x", None], dtype=pd.StringDtype(string_storage)), + } + ) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) @pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): From 6239d5d7c976f8cb91985fd9b712a26370936d4f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 22 Aug 2024 15:08:52 -0400 Subject: [PATCH 221/396] TST (string dtype): fix IO dtype_backend tests for storage of str dtype of columns' Index (#59509) --- pandas/tests/io/json/test_pandas.py | 4 +++- pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 12 ++++++------ pandas/tests/io/parser/test_read_fwf.py | 4 +++- pandas/tests/io/test_html.py | 4 +++- pandas/tests/io/xml/test_xml.py | 4 +++- 5 files changed, 18 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c0ecd4fd7cf59..de40441fe25dd 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2088,7 +2088,9 @@ def test_read_json_dtype_backend( if orient == "values": expected.columns = list(range(8)) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) @pytest.mark.parametrize("orient", ["split", "records", "index"]) def test_read_json_nullable_series(self, string_storage, dtype_backend, orient): diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index e072e97637f2a..800ece5a409e1 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -470,12 +470,12 @@ def test_dtype_backend_string(all_parsers, string_storage): """ result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable") - expected = DataFrame( - { - "a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)), - "b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)), - } - ) + expected = DataFrame( + { + "a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)), + "b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)), + }, + ) tm.assert_frame_equal(result, expected) diff --git 
a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 72a341d8487e7..d8fe168341ff1 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -1001,7 +1001,9 @@ def test_dtype_backend(string_storage, dtype_backend): ) expected["i"] = ArrowExtensionArray(pa.array([None, None])) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) def test_invalid_dtype_backend(): diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 56c9cf8df0e89..826c0a1ca7cf9 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -211,7 +211,9 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): } ) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) @pytest.mark.network @pytest.mark.single_cpu diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 04c040efa767c..92e89ddbc8e80 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -2065,7 +2065,9 @@ def test_read_xml_nullable_dtypes( ) expected["g"] = ArrowExtensionArray(pa.array([None, None])) - tm.assert_frame_equal(result, expected) + # the storage of the str columns' Index is also affected by the + # string_storage setting -> ignore that for checking the result + tm.assert_frame_equal(result, expected, check_column_type=False) def test_invalid_dtype_backend(): From b1ee91fd9be6551c28ff4d7321cb9459c8ab88ae Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 14 Aug 2024 12:44:56 -0700 Subject: [PATCH 222/396] REF (string): Move StringArrayNumpySemantics methods to base class (#59514) * REF (string): Move StringArrayNumpySemantics methods to base class * mypy fixup --- pandas/core/arrays/string_.py | 55 +++++++++++++++-------------------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c881437ba25af..f3e5e6fe5f3da 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -738,11 +738,23 @@ def astype(self, dtype, copy: bool = True): def _reduce( self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs ): + if self.dtype.na_value is np.nan and name in ["any", "all"]: + if name == "any": + return nanops.nanany(self._ndarray, skipna=skipna) + else: + return nanops.nanall(self._ndarray, skipna=skipna) + if name in ["min", "max"]: return getattr(self, name)(skipna=skipna, axis=axis) raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: + if self.dtype.na_value is np.nan and result is libmissing.NA: + # the masked_reductions use pd.NA -> convert to np.nan + return np.nan + return super()._wrap_reduction_result(axis, result) + def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: nv.validate_min((), kwargs) result = masked_reductions.min( @@ -761,7 +773,11 @@ def value_counts(self, dropna: bool = True) -> Series: from pandas.core.algorithms import value_counts_internal as value_counts result = value_counts(self._ndarray, dropna=dropna).astype("Int64") + result = 
value_counts(self._ndarray, sort=False, dropna=dropna) result.index = result.index.astype(self.dtype) + + if self.dtype.na_value is libmissing.NA: + result = result.astype("Int64") return result def memory_usage(self, deep: bool = False) -> int: @@ -812,7 +828,13 @@ def _cmp_method(self, other, op): # logical result = np.zeros(len(self._ndarray), dtype="bool") result[valid] = op(self._ndarray[valid], other) - return BooleanArray(result, mask) + res_arr = BooleanArray(result, mask) + if self.dtype.na_value is np.nan: + if op == operator.ne: + return res_arr.to_numpy(np.bool_, na_value=True) + else: + return res_arr.to_numpy(np.bool_, na_value=False) + return res_arr _arith_method = _cmp_method @@ -853,37 +875,6 @@ def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: # we always preserve the dtype return NDArrayBacked._from_backing_data(self, arr) - def _reduce( - self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs - ): - if name in ["any", "all"]: - if name == "any": - return nanops.nanany(self._ndarray, skipna=skipna) - else: - return nanops.nanall(self._ndarray, skipna=skipna) - else: - return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) - - def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: - # the masked_reductions use pd.NA - if result is libmissing.NA: - return np.nan - return super()._wrap_reduction_result(axis, result) - - def _cmp_method(self, other, op): - result = super()._cmp_method(other, op) - if op == operator.ne: - return result.to_numpy(np.bool_, na_value=True) - else: - return result.to_numpy(np.bool_, na_value=False) - - def value_counts(self, dropna: bool = True) -> Series: - from pandas.core.algorithms import value_counts_internal as value_counts - - result = value_counts(self._ndarray, sort=False, dropna=dropna) - result.index = result.index.astype(self.dtype) - return result - # ------------------------------------------------------------------------ # String methods interface _str_na_value = np.nan From e61c628f959123558c116de9cc0b5b590e8464be Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 15 Aug 2024 10:45:12 -0700 Subject: [PATCH 223/396] REF (string): remove _str_na_value (#59515) * REF (string): remove _str_na_value * mypy fixup --- pandas/core/arrays/numpy_.py | 4 ---- pandas/core/arrays/string_.py | 10 ---------- pandas/core/arrays/string_arrow.py | 4 ---- pandas/core/strings/object_array.py | 10 ++++------ 4 files changed, 4 insertions(+), 24 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 07eb91e0cb13b..03712f75db0c7 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -557,7 +557,3 @@ def _wrap_ndarray_result(self, result: np.ndarray): return TimedeltaArray._simple_new(result, dtype=result.dtype) return type(self)(result) - - # ------------------------------------------------------------------------ - # String methods interface - _str_na_value = np.nan diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f3e5e6fe5f3da..a64be0b197494 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -838,12 +838,6 @@ def _cmp_method(self, other, op): _arith_method = _cmp_method - # ------------------------------------------------------------------------ - # String methods interface - # error: Incompatible types in assignment (expression has type "NAType", - # base class "NumpyExtensionArray" defined the type as "float") - _str_na_value = libmissing.NA # 
type: ignore[assignment] - class StringArrayNumpySemantics(StringArray): _storage = "python" @@ -874,7 +868,3 @@ def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: # need to override NumpyExtensionArray._from_backing_data to ensure # we always preserve the dtype return NDArrayBacked._from_backing_data(self, arr) - - # ------------------------------------------------------------------------ - # String methods interface - _str_na_value = np.nan diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 00d92958cc8dc..136411ad1a83d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -279,10 +279,6 @@ def _data(self): # ------------------------------------------------------------------------ # String methods interface - # error: Incompatible types in assignment (expression has type "NAType", - # base class "ObjectStringArrayMixin" defined the type as "float") - _str_na_value = libmissing.NA # type: ignore[assignment] - _str_map = BaseStringArray._str_map def _str_contains( diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 0029beccc40a8..090e27ec58cc3 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -37,8 +37,6 @@ class ObjectStringArrayMixin(BaseStringArrayMethods): String Methods operating on object-dtype ndarrays. """ - _str_na_value = np.nan - def __len__(self) -> int: # For typing, _str_map relies on the object being sized. raise NotImplementedError @@ -56,7 +54,7 @@ def _str_map( na_value : Scalar, optional The value to set for NA values. Might also be used for the fill value if the callable `f` raises an exception. - This defaults to ``self._str_na_value`` which is ``np.nan`` + This defaults to ``self.dtype.na_value`` which is ``np.nan`` for object-dtype and Categorical and ``pd.NA`` for StringArray. dtype : Dtype, optional The dtype of the result array. 
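# ---------------------------------------------------------------------
# Illustrative sketch (not part of the patch): with _str_na_value gone,
# the fill value for string methods is read from the dtype itself, as
# the updated docstring above describes (np.nan for object dtype,
# pd.NA for the default StringDtype).
import numpy as np
import pandas as pd

pd.StringDtype("python").na_value                   # <NA> (pd.NA)
pd.StringDtype("python", na_value=np.nan).na_value  # nan
# ---------------------------------------------------------------------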
@@ -66,7 +64,7 @@ def _str_map( if dtype is None: dtype = np.dtype("object") if na_value is None: - na_value = self._str_na_value + na_value = self.dtype.na_value # type: ignore[attr-defined] if not len(self): return np.array([], dtype=dtype) @@ -270,7 +268,7 @@ def f(x): return x.get(i) elif len(x) > i >= -len(x): return x[i] - return self._str_na_value + return self.dtype.na_value # type: ignore[attr-defined] return self._str_map(f) @@ -473,7 +471,7 @@ def _str_removesuffix(self, suffix: str) -> Series: def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): regex = re.compile(pat, flags=flags) - na_value = self._str_na_value + na_value = self.dtype.na_value # type: ignore[attr-defined] if not expand: From be6354b2ff4dfcfa7b800e79293070f5182f0939 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 15 Aug 2024 12:01:21 -0700 Subject: [PATCH 224/396] REF (string): move ArrowStringArrayNumpySemantics methods to base class (#59501) * REF: move ArrowStringArrayNumpySemantics methods to parent class * REF: move methods to ArrowStringArray * mypy fixup * Fix incorrect double-unpacking * move methods to subclass --- pandas/core/arrays/string_arrow.py | 109 +++++++++++++---------------- 1 file changed, 48 insertions(+), 61 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 136411ad1a83d..91c1f20ba93c6 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,6 +1,5 @@ from __future__ import annotations -from functools import partial import operator import re from typing import ( @@ -209,12 +208,17 @@ def dtype(self) -> StringDtype: # type: ignore[override] return self._dtype def insert(self, loc: int, item) -> ArrowStringArray: + if self.dtype.na_value is np.nan and item is np.nan: + item = libmissing.NA if not isinstance(item, str) and item is not libmissing.NA: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) - @classmethod - def _result_converter(cls, values, na=None): + def _result_converter(self, values, na=None): + if self.dtype.na_value is np.nan: + if not isna(na): + values = values.fill_null(bool(na)) + return ArrowExtensionArray(values).to_numpy(na_value=np.nan) return BooleanDtype().__from_arrow__(values) def _maybe_convert_setitem_value(self, value): @@ -494,11 +498,30 @@ def _str_get_dummies(self, sep: str = "|"): return dummies.astype(np.int64, copy=False), labels def _convert_int_dtype(self, result): + if self.dtype.na_value is np.nan: + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() + if result.dtype == np.int32: + result = result.astype(np.int64) + return result + return Int64Dtype().__from_arrow__(result) def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): + if self.dtype.na_value is np.nan and name in ["any", "all"]: + if not skipna: + nas = pc.is_null(self._pa_array) + arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, "")) + else: + arr = pc.not_equal(self._pa_array, "") + return ArrowExtensionArray(arr)._reduce( + name, skipna=skipna, keepdims=keepdims, **kwargs + ) + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) if name in ("argmin", "argmax") and isinstance(result, pa.Array): return self._convert_int_dtype(result) @@ -529,67 +552,31 @@ def _rank( ) ) - -class ArrowStringArrayNumpySemantics(ArrowStringArray): - _storage = "pyarrow" - _na_value = np.nan - - @classmethod - def _result_converter(cls, 
values, na=None): - if not isna(na): - values = values.fill_null(bool(na)) - return ArrowExtensionArray(values).to_numpy(na_value=np.nan) - - def __getattribute__(self, item): - # ArrowStringArray and we both inherit from ArrowExtensionArray, which - # creates inheritance problems (Diamond inheritance) - if item in ArrowStringArrayMixin.__dict__ and item not in ( - "_pa_array", - "__dict__", - ): - return partial(getattr(ArrowStringArrayMixin, item), self) - return super().__getattribute__(item) - - def _convert_int_dtype(self, result): - if isinstance(result, pa.Array): - result = result.to_numpy(zero_copy_only=False) - else: - result = result.to_numpy() - if result.dtype == np.int32: - result = result.astype(np.int64) + def value_counts(self, dropna: bool = True) -> Series: + result = super().value_counts(dropna=dropna) + if self.dtype.na_value is np.nan: + res_values = result._values.to_numpy() + return result._constructor( + res_values, index=result.index, name=result.name, copy=False + ) return result def _cmp_method(self, other, op): result = super()._cmp_method(other, op) - if op == operator.ne: - return result.to_numpy(np.bool_, na_value=True) - else: - return result.to_numpy(np.bool_, na_value=False) - - def value_counts(self, dropna: bool = True) -> Series: - from pandas import Series - - result = super().value_counts(dropna) - return Series( - result._values.to_numpy(), index=result.index, name=result.name, copy=False - ) - - def _reduce( - self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs - ): - if name in ["any", "all"]: - if not skipna and name == "all": - nas = pc.invert(pc.is_null(self._pa_array)) - arr = pc.and_kleene(nas, pc.not_equal(self._pa_array, "")) + if self.dtype.na_value is np.nan: + if op == operator.ne: + return result.to_numpy(np.bool_, na_value=True) else: - arr = pc.not_equal(self._pa_array, "") - return ArrowExtensionArray(arr)._reduce( - name, skipna=skipna, keepdims=keepdims, **kwargs - ) - else: - return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + return result.to_numpy(np.bool_, na_value=False) + return result - def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics: - if item is np.nan: - item = libmissing.NA - return super().insert(loc, item) # type: ignore[return-value] + +class ArrowStringArrayNumpySemantics(ArrowStringArray): + _na_value = np.nan + _str_get = ArrowStringArrayMixin._str_get + _str_removesuffix = ArrowStringArrayMixin._str_removesuffix + _str_capitalize = ArrowStringArrayMixin._str_capitalize + _str_pad = ArrowStringArrayMixin._str_pad + _str_title = ArrowStringArrayMixin._str_title + _str_swapcase = ArrowStringArrayMixin._str_swapcase + _str_slice_replace = ArrowStringArrayMixin._str_slice_replace From 3bb9ae624dda7ddf86a24222529cb16a695a7725 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 16 Aug 2024 10:29:26 -0700 Subject: [PATCH 225/396] API (string): return str dtype for .dt methods, DatetimeIndex methods (#59526) * API (string): return str dtype for .dt methods, DatetimeIndex methods * mypy fixup --- pandas/core/arrays/datetimelike.py | 6 +++++ pandas/core/arrays/datetimes.py | 17 +++++++++++++ pandas/core/indexes/datetimes.py | 2 +- pandas/core/indexes/extension.py | 4 ++-- pandas/tests/arrays/test_datetimelike.py | 24 ++++++++++++------- .../series/accessors/test_dt_accessor.py | 8 +++---- 6 files changed, 45 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 1042a1b3fde61..e85c0222bbec3 
100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -20,6 +20,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( algos, lib, @@ -1789,6 +1791,10 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: dtype='object') """ result = self._format_native_types(date_format=date_format, na_rep=np.nan) + if using_string_dtype(): + from pandas import StringDtype + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result.astype(object, copy=False) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index a146220d249e2..0db25db02e75a 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -14,6 +14,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( lib, tslib, @@ -1306,6 +1308,13 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: values, "month_name", locale=locale, reso=self._creso ) result = self._maybe_mask_results(result, fill_value=None) + if using_string_dtype(): + from pandas import ( + StringDtype, + array as pd_array, + ) + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result def day_name(self, locale=None) -> npt.NDArray[np.object_]: @@ -1363,6 +1372,14 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]: values, "day_name", locale=locale, reso=self._creso ) result = self._maybe_mask_results(result, fill_value=None) + if using_string_dtype(): + # TODO: no tests that check for dtype of result as of 2024-08-15 + from pandas import ( + StringDtype, + array as pd_array, + ) + + return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value] return result @property diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index c978abd8c2427..3204a9c97ee73 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -276,7 +276,7 @@ def _engine_type(self) -> type[libindex.DatetimeEngine]: @doc(DatetimeArray.strftime) def strftime(self, date_format) -> Index: arr = self._data.strftime(date_format) - return Index(arr, name=self.name, dtype=object) + return Index(arr, name=self.name, dtype=arr.dtype) @doc(DatetimeArray.tz_convert) def tz_convert(self, tz) -> Self: diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 61949531f37df..371d3c811e772 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -71,7 +71,7 @@ def fget(self): return type(self)._simple_new(result, name=self.name) elif isinstance(result, ABCDataFrame): return result.set_index(self) - return Index(result, name=self.name) + return Index(result, name=self.name, dtype=result.dtype) return result def fset(self, value) -> None: @@ -98,7 +98,7 @@ def method(self, *args, **kwargs): # type: ignore[misc] return type(self)._simple_new(result, name=self.name) elif isinstance(result, ABCDataFrame): return result.set_index(self) - return Index(result, name=self.name) + return Index(result, name=self.name, dtype=result.dtype) return result # error: "property" has no attribute "__name__" diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 360ab960088ed..b346294a892d3 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -889,20 +889,24 @@ def 
test_concat_same_type_different_freq(self, unit): tm.assert_datetime_array_equal(result, expected) - def test_strftime(self, arr1d): + def test_strftime(self, arr1d, using_infer_string): arr = arr1d result = arr.strftime("%Y %b") expected = np.array([ts.strftime("%Y %b") for ts in arr], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) - def test_strftime_nat(self): + def test_strftime_nat(self, using_infer_string): # GH 29578 arr = DatetimeIndex(["2019-01-01", NaT])._data result = arr.strftime("%Y-%m-%d") expected = np.array(["2019-01-01", np.nan], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) class TestTimedeltaArray(SharedTests): @@ -1159,20 +1163,24 @@ def test_array_interface(self, arr1d): expected = np.asarray(arr).astype("S20") tm.assert_numpy_array_equal(result, expected) - def test_strftime(self, arr1d): + def test_strftime(self, arr1d, using_infer_string): arr = arr1d result = arr.strftime("%Y") expected = np.array([per.strftime("%Y") for per in arr], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) - def test_strftime_nat(self): + def test_strftime_nat(self, using_infer_string): # GH 29578 arr = PeriodArray(PeriodIndex(["2019-01-01", NaT], dtype="period[D]")) result = arr.strftime("%Y-%m-%d") expected = np.array(["2019-01-01", np.nan], dtype=object) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string: + expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan)) + tm.assert_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 0dd2c227d6aa7..18ee81581bdc3 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -29,6 +29,7 @@ Period, PeriodIndex, Series, + StringDtype, TimedeltaIndex, date_range, period_range, @@ -528,7 +529,6 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): ser = pd.concat([ser, Series([pd.NaT])]) assert np.isnan(ser.dt.month_name(locale=time_locale).iloc[-1]) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime(self): # GH 10086 ser = Series(date_range("20130101", periods=5)) @@ -599,10 +599,9 @@ def test_strftime_period_days(self, using_infer_string): dtype="=U10", ) if using_infer_string: - expected = expected.astype("str") + expected = expected.astype(StringDtype(na_value=np.nan)) tm.assert_index_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime_dt64_microsecond_resolution(self): ser = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)]) result = ser.dt.strftime("%Y-%m-%d %H:%M:%S") @@ -635,7 +634,6 @@ def test_strftime_period_minutes(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data", [ @@ -658,7 +656,7 @@ def test_strftime_all_nat(self, data): ser = Series(data) with tm.assert_produces_warning(None): result = ser.dt.strftime("%Y-%m-%d") - expected = 
Series([np.nan], dtype=object) + expected = Series([np.nan], dtype="str") tm.assert_series_equal(result, expected) def test_valid_dt_with_missing_values(self): From c2259ad28ef794e4b1d849dbf938d8bda6fc56db Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 27 Aug 2024 14:23:07 -0400 Subject: [PATCH 226/396] Pick required fix from 2542674ee9 #56709 --- pandas/tests/computation/test_eval.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index e8fad6b8cbd63..cf3e50094ac97 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -606,11 +606,10 @@ def test_unary_in_array(self): ) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("expr", ["x < -0.1", "-5 > x"]) - def test_float_comparison_bin_op(self, dtype, expr): + def test_float_comparison_bin_op(self, float_numpy_dtype, expr): # GH 16363 - df = DataFrame({"x": np.array([0], dtype=dtype)}) + df = DataFrame({"x": np.array([0], dtype=float_numpy_dtype)}) res = df.eval(expr) assert res.values == np.array([False]) From 41bd34fc8e65c84fb3f2714223baf0361151e7bf Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 21 Aug 2024 21:59:09 -0400 Subject: [PATCH 227/396] Pick required fix from f4232e7 #58006 --- pandas/tests/frame/test_reductions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index a64f06f6c0521..a4263279a7bd5 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1999,7 +1999,9 @@ def test_minmax_extensionarray(method, numeric_only): def test_frame_mixed_numeric_object_with_timestamp(ts_value): # GH 13912 df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]}) - with pytest.raises(TypeError, match="does not support operation|Cannot perform"): + with pytest.raises( + TypeError, match="does not support (operation|reduction)|Cannot perform" + ): df.sum() From b488b2ab286f142c708766c5f105815c78f39c86 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 20 Sep 2024 13:56:11 -0400 Subject: [PATCH 228/396] Pick required fix from #55901 and #59581 --- pandas/tests/io/test_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 72efe989804e4..ae8889faaf1c9 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1314,6 +1314,7 @@ def test_empty_dataframe(self, fp): _HAVE_FASTPARQUET and Version(fastparquet.__version__) > Version("2022.12"), reason="fastparquet bug, see https://github.com/dask/fastparquet/issues/929", ) + @pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index") def test_timezone_aware_index(self, fp, timezone_aware_date_list): idx = 5 * [timezone_aware_date_list] From 99e98e5ed783cad2dc8897e7ed9199632bc985ab Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 22 Aug 2024 14:47:08 -0400 Subject: [PATCH 229/396] Remove .pre-commit check for pytest ref #56671 --- .pre-commit-config.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4b02ad7cf886f..9b3a9827e67e2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -274,13 +274,6 @@ repos: language: python types: [rst] files: ^doc/source/(development|reference)/ - - id: 
unwanted-patterns-bare-pytest-raises - name: Check for use of bare pytest raises - language: python - entry: python scripts/validate_unwanted_patterns.py --validation-type="bare_pytest_raises" - types: [python] - files: ^pandas/tests/ - exclude: ^pandas/tests/extension/ - id: unwanted-patterns-private-function-across-module name: Check for use of private functions across modules language: python From 7edc8d75d8af5b4fba157d9df79cf83643540500 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 22 Aug 2024 15:48:34 -0400 Subject: [PATCH 230/396] Skip niche issue --- pandas/tests/strings/test_find_replace.py | 24 +++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 2d7c9754ee319..78ce1d7418886 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -233,14 +233,22 @@ def test_contains_nan(any_string_dtype): expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) - result = s.str.contains("foo", na="foo") - if any_string_dtype == "object": - expected = Series(["foo", "foo", "foo"], dtype=np.object_) - elif any_string_dtype.na_value is np.nan: - expected = Series([True, True, True], dtype=np.bool_) - else: - expected = Series([True, True, True], dtype="boolean") - tm.assert_series_equal(result, expected) + # this particular combination of events is broken on 2.3 + # would require cherry picking #58483, which in turn requires #57481 + # which introduce many behavioral changes + if not ( + hasattr(any_string_dtype, "storage") + and any_string_dtype.storage == "python" + and any_string_dtype.na_value is np.nan + ): + result = s.str.contains("foo", na="foo") + if any_string_dtype == "object": + expected = Series(["foo", "foo", "foo"], dtype=np.object_) + elif any_string_dtype.na_value is np.nan: + expected = Series([True, True, True], dtype=np.bool_) + else: + expected = Series([True, True, True], dtype="boolean") + tm.assert_series_equal(result, expected) result = s.str.contains("foo") expected_dtype = ( From 24bff566bcd69853e31fc26f19739d691edefd5a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 27 Aug 2024 15:17:38 -0400 Subject: [PATCH 231/396] Add required skip from #58467 --- pandas/tests/io/test_parquet.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index ae8889faaf1c9..59662ec77d52f 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -973,6 +973,8 @@ def test_timestamp_nanoseconds(self, pa): check_round_trip(df, pa, write_kwargs={"version": ver}) def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): + pytest.importorskip("pyarrow", "11.0.0") + if timezone_aware_date_list.tzinfo != datetime.timezone.utc: request.applymarker( pytest.mark.xfail( From 75b551f5de3894d21717d879390bbfd3e2b03294 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 20 Sep 2024 13:52:24 -0400 Subject: [PATCH 232/396] Remove tests that will fail without backport of #58437 --- pandas/tests/frame/test_query_eval.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 13232a0909c5b..7dde0683aa960 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -190,6 +190,25 @@ def test_eval_object_dtype_binop(self): expected = DataFrame({"a1": 
["Y", "N"], "c": [True, False]}) tm.assert_frame_equal(res, expected) + def test_extension_array_eval(self, engine, parser, request): + # GH#58748 + if engine == "numexpr": + mark = pytest.mark.xfail( + reason="numexpr does not support extension array dtypes" + ) + request.applymarker(mark) + df = DataFrame({"a": pd.array([1, 2, 3]), "b": pd.array([4, 5, 6])}) + result = df.eval("a / b", engine=engine, parser=parser) + expected = Series(pd.array([0.25, 0.40, 0.50])) + tm.assert_series_equal(result, expected) + + def test_complex_eval(self, engine, parser): + # GH#21374 + df = DataFrame({"a": [1 + 2j], "b": [1 + 1j]}) + result = df.eval("a/b", engine=engine, parser=parser) + expected = Series([1.5 + 0.5j]) + tm.assert_series_equal(result, expected) + class TestDataFrameQueryWithMultiIndex: def test_query_with_named_multiindex(self, parser, engine): From 5e27da4aa3498697ec629191f80feacbeb0dd4be Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 Sep 2024 23:32:38 +0200 Subject: [PATCH 233/396] additional test fixes (for tests that changed or no longer exist on main) --- pandas/tests/frame/indexing/test_indexing.py | 2 +- pandas/tests/frame/methods/test_align.py | 3 +++ pandas/tests/frame/methods/test_quantile.py | 3 --- pandas/tests/frame/methods/test_to_csv.py | 1 + pandas/tests/frame/methods/test_update.py | 2 +- pandas/tests/groupby/test_apply.py | 11 +++++++---- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/groupby/test_raises.py | 2 +- pandas/tests/indexing/test_coercion.py | 2 +- pandas/tests/indexing/test_indexing.py | 15 ++++++++++++--- .../io/parser/common/test_file_buffer_url.py | 4 ++-- pandas/tests/io/test_common.py | 4 +++- pandas/tests/io/test_pickle.py | 10 ++++++++-- pandas/tests/io/test_stata.py | 4 ++-- pandas/tests/tools/test_to_datetime.py | 6 ++---- 15 files changed, 45 insertions(+), 26 deletions(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 795e6b974ca34..2ab518405503c 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -519,7 +519,7 @@ def test_setitem_ambig(self, using_infer_string): else: assert dm[2].dtype == np.object_ - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_setitem_None(self, float_frame, using_infer_string): # GH #766 float_frame[None] = float_frame["A"] diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py index 5a9c47866dae8..15a97a99caa5a 100644 --- a/pandas/tests/frame/methods/test_align.py +++ b/pandas/tests/frame/methods/test_align.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -155,6 +157,7 @@ def test_align_series_condition(self): expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]}) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_align_int(self, int_frame): # test other non-float types other = DataFrame(index=range(5), columns=["A", "B", "C"]) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index ec070467b242e..15af2a14a042e 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -354,7 +354,6 @@ def test_quantile_multi_empty(self, interp_method): ) 
tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_datetime(self, unit): dti = pd.to_datetime(["2010", "2011"]).as_unit(unit) df = DataFrame({"a": dti, "b": [0, 5]}) @@ -408,7 +407,6 @@ def test_quantile_datetime(self, unit): expected = DataFrame(index=[0.5], columns=[]) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype", [ @@ -679,7 +677,6 @@ def test_quantile_nat(self, interp_method, request, using_array_manager, unit): ) tm.assert_frame_equal(res, exp) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_empty_no_rows_floats(self, interp_method): interpolation, method = interp_method diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 20a8e95f990ec..4a65c3929944b 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -35,6 +35,7 @@ def read_csv(self, path, **kwargs): return read_csv(path, **params) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_from_csv1(self, float_frame, datetime_frame): with tm.ensure_clean("__tmp_to_csv_from_csv1__") as path: float_frame.iloc[:5, float_frame.columns.get_loc("A")] = np.nan diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 8af1798aa8e00..56700ab6bd1f7 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -169,7 +169,7 @@ def test_update_with_different_dtype(self, using_copy_on_write): { "a": [1, 3], "b": [np.nan, 2], - "c": Series(["foo", np.nan], dtype="object"), + "c": Series(["foo", np.nan]), } ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index d91510d834e6c..cc736f2bf53ba 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -126,7 +128,7 @@ def test_apply_trivial(using_infer_string): {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - dtype = "string" if using_infer_string else "object" + dtype = "str" if using_infer_string else "object" expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" @@ -143,7 +145,7 @@ def test_apply_trivial_fail(using_infer_string): {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - dtype = "string" if using_infer_string else "object" + dtype = "str" if using_infer_string else "object" expected = pd.concat([df, df], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -1299,12 +1301,13 @@ def test_apply_dropna_with_indexed_same(dropna): @pytest.mark.parametrize( "as_index, expected", [ - [ + pytest.param( False, DataFrame( [[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None], dtype=object) ), - ], + marks=pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)"), + ), [ True, Series( diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 10f45cac1ff66..015a9db32883b 100644 --- 
a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1005,7 +1005,7 @@ def test_raises_on_nuisance(df): depr_msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=depr_msg): grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) - msg = "does not support reduction 'sum'" + msg = "does not support reduction 'sum'|Cannot perform reduction 'sum'" with pytest.raises(TypeError, match=msg): grouped.agg(lambda x: x.sum(0, numeric_only=False)) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 64780d0ba03d8..d5b7a3f25d0eb 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -219,7 +219,7 @@ def func(x): getattr(gb, how)(func) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("how", ["agg", "transform"]) @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) def test_groupby_raises_string_np( diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index c0a62ecb06f56..ac3bfe3a13a44 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -834,6 +834,7 @@ def replacer(self, how, from_key, to_key): def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = obj.astype(from_key) assert obj.dtype == from_key if from_key.startswith("datetime") and to_key.startswith("datetime"): @@ -854,7 +855,6 @@ def test_replace_series(self, how, to_key, from_key, replacer): else: exp = pd.Series(self.rep[to_key], index=index, name="yyy") - assert exp.dtype == to_key msg = "Downcasting behavior in `replace`" warn = FutureWarning diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 908e95accfb0f..e57598cfc2be1 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -687,7 +687,7 @@ def test_loc_setitem_fullindex_views(self): df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) - def test_rhs_alignment(self): + def test_rhs_alignment(self, using_infer_string): # GH8258, tests that both rows & columns are aligned to what is # assigned to. 
covers both uniform data-type & multi-type cases def run_tests(df, rhs, right_loc, right_iloc): @@ -731,8 +731,17 @@ def run_tests(df, rhs, right_loc, right_iloc): frame["jolie"] = frame["jolie"].map(lambda x: f"@{x}") right_iloc["joe"] = [1.0, "@-28", "@-20", "@-12", 17.0] right_iloc["jolie"] = ["@2", -26.0, -18.0, -10.0, "@18"] - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - run_tests(df, rhs, right_loc, right_iloc) + if using_infer_string: + with pytest.raises( + TypeError, match="Must provide strings|Scalar must be NA or str" + ): + with tm.assert_produces_warning( + FutureWarning, match="incompatible dtype" + ): + run_tests(df, rhs, right_loc, right_iloc) + else: + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + run_tests(df, rhs, right_loc, right_iloc) @pytest.mark.parametrize( "idx", [_mklbl("A", 20), np.arange(20) + 100, np.linspace(100, 150, 20)] diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index 1f5021c8a24cc..c13b77f365496 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -87,8 +87,8 @@ def test_path_local_path(all_parsers): parser = all_parsers df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_localpath( df.to_csv, lambda p: parser.read_csv(p, index_col=0) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 75ecd1d929d58..d38f716cf6a98 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -308,10 +308,12 @@ def test_read_expands_user_home_dir( "pyarrow", ("io", "data", "feather", "feather-0_3_1.feather"), ), - ( + pytest.param( pd.read_hdf, "tables", ("io", "data", "legacy_hdf", "datetimetz_object.h5"), + # cleaned-up in https://github.com/pandas-dev/pandas/pull/57387 on main + marks=pytest.mark.xfail(reason="TODO(infer_string)", strict=False), ), (pd.read_stata, "os", ("io", "data", "stata", "stata10_115.dta")), (pd.read_sas, "os", ("io", "sas", "data", "test1.sas7bdat")), diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 4f3993a038197..05f4a20ee42d8 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -413,10 +413,16 @@ def test_read(self, protocol, get_random_path): @pytest.mark.parametrize( ["pickle_file", "excols"], [ - ("test_py27.pkl", Index(["a", "b", "c"])), + ("test_py27.pkl", Index(["a", "b", "c"], dtype=object)), ( "test_mi_py27.pkl", - pd.MultiIndex.from_arrays([["a", "b", "c"], ["A", "B", "C"]]), + pd.MultiIndex( + [ + Index(["a", "b", "c"], dtype=object), + Index(["A", "B", "C"], dtype=object), + ], + [np.array([0, 1, 2]), np.array([0, 1, 2])], + ), ), ], ) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 3c5e843e2e97b..09509fb495034 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1563,8 +1563,8 @@ def test_path_pathlib(self): def test_pickle_path_localpath(self): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = 
"index" reader = lambda x: read_stata(x).set_index("index") diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index ede38ce9c9a09..e6ae3c1a39bd0 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1207,10 +1207,8 @@ def test_out_of_bounds_errors_ignore2(self): # GH#12424 msg = "errors='ignore' is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): - res = to_datetime( - Series(["2362-01-01", np.nan], dtype=object), errors="ignore" - ) - exp = Series(["2362-01-01", np.nan], dtype=object) + res = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") + exp = Series(["2362-01-01", np.nan]) tm.assert_series_equal(res, exp) def test_to_datetime_tz(self, cache): From 05f0acbc4e00e9f6c642eed35b3f4c2925d9e4b0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Aug 2024 21:31:40 +0200 Subject: [PATCH 234/396] String dtype: still return nullable NA-variant in object inference (`maybe_converts_object`) if requested (#59487) * String dtype: maybe_converts_object give precedence to nullable dtype * update datetimelike input validation * update tests and remove xfails * explicitly test pd.array() behaviour (remove xfail) * fixup allow_2d * undo changes related to datetimelike input validation * fix test for str on current main --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .../tests/arrays/string_/test_string_arrow.py | 5 +-- pandas/tests/arrays/test_array.py | 40 ++++++++++++++++++- pandas/tests/arrays/test_datetimelike.py | 7 +--- pandas/tests/base/test_value_counts.py | 4 +- .../dtypes/cast/test_construct_ndarray.py | 2 +- .../io/parser/usecols/test_usecols_basic.py | 3 -- 6 files changed, 46 insertions(+), 15 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 72d672ba8a7d9..e6957feecf4b5 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -36,9 +36,8 @@ def test_config(string_storage, using_infer_string): result = pd.array(["a", "b"]) assert result.dtype.storage == string_storage - dtype = StringDtype( - string_storage, na_value=np.nan if using_infer_string else pd.NA - ) + # pd.array(..) 
by default always returns the NA-variant + dtype = StringDtype(string_storage, na_value=pd.NA) expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype) tm.assert_equal(result, expected) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 76b42b643ee69..158a963845b06 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -218,6 +218,15 @@ def test_dt64_array(dtype_unit): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + ["a", None], + "str", + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)) + if using_string_dtype() + else NumpyExtensionArray(np.array(["a", "None"])), + ), ( ["a", None], pd.StringDtype(), @@ -225,6 +234,29 @@ def test_dt64_array(dtype_unit): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + ["a", None], + pd.StringDtype(na_value=np.nan), + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)), + ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype(), + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype()), + ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype(na_value=np.nan), + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + ), # Boolean ( [True, None], @@ -277,7 +309,6 @@ def test_array_copy(): cet = pytz.timezone("CET") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data, expected", [ @@ -370,6 +401,13 @@ def test_array_copy(): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype()), + ), # Boolean ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")), ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")), diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index b346294a892d3..ede81264cb415 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -295,9 +295,7 @@ def test_searchsorted(self): assert result == 10 @pytest.mark.parametrize("box", [None, "index", "series"]) - def test_searchsorted_castable_strings( - self, arr1d, box, string_storage, using_infer_string - ): + def test_searchsorted_castable_strings(self, arr1d, box, string_storage): arr = arr1d if box is None: pass @@ -333,8 +331,7 @@ def test_searchsorted_castable_strings( TypeError, match=re.escape( f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " - "or array of those. Got " - f"{'str' if using_infer_string else 'string'} array instead." + "or array of those. Got string array instead." 
), ): arr.searchsorted([str(arr[1]), "baz"]) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 2729666398877..1f643f24ed5f7 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -127,7 +127,7 @@ def test_value_counts_inferred(index_or_series, using_infer_string): else: exp = np.unique(np.array(s_values, dtype=np.object_)) if using_infer_string: - exp = array(exp) + exp = array(exp, dtype="str") tm.assert_equal(s.unique(), exp) assert s.nunique() == 4 @@ -205,7 +205,7 @@ def test_value_counts_bins(index_or_series, using_infer_string): else: exp = np.array(["a", "b", np.nan, "d"], dtype=object) if using_infer_string: - exp = array(exp) + exp = array(exp, dtype="str") tm.assert_equal(s.unique(), exp) assert s.nunique() == 3 diff --git a/pandas/tests/dtypes/cast/test_construct_ndarray.py b/pandas/tests/dtypes/cast/test_construct_ndarray.py index ab468c81124bc..6b9b2dfda6e8b 100644 --- a/pandas/tests/dtypes/cast/test_construct_ndarray.py +++ b/pandas/tests/dtypes/cast/test_construct_ndarray.py @@ -21,7 +21,7 @@ def test_construct_1d_ndarray_preserving_na( ): result = sanitize_array(values, index=None, dtype=dtype) if using_infer_string and expected.dtype == object and dtype is None: - tm.assert_extension_array_equal(result, pd.array(expected)) + tm.assert_extension_array_equal(result, pd.array(expected, dtype="str")) else: tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 24937de163662..767fba666e417 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserError from pandas import ( @@ -547,7 +545,6 @@ def test_usecols_additional_columns_integer_columns(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_usecols_dtype(all_parsers): parser = all_parsers data = """ From 0a2981a7aff8d73c94922f5fe008a7be221c8a09 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 23 Sep 2024 13:36:31 +0200 Subject: [PATCH 235/396] Enable CoW in the string test build --- .github/workflows/unit-tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index eef899173403b..bd7da3a804634 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -89,9 +89,11 @@ jobs: - name: "Future infer strings" env_file: actions-312.yaml pandas_future_infer_string: "1" + pandas_copy_on_write: "1" - name: "Future infer strings (without pyarrow)" env_file: actions-311.yaml pandas_future_infer_string: "1" + pandas_copy_on_write: "1" - name: "Pypy" env_file: actions-pypy-39.yaml pattern: "not slow and not network and not single_cpu" From baefc5c2668c4c230cc656b053e7de42cf271e30 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 23 Sep 2024 20:27:32 -0400 Subject: [PATCH 236/396] Skip test if pyarrow not installed in test_numeric_only --- pandas/tests/groupby/test_numeric_only.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 029d322e4fdc3..b1fa541d42086 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ 
b/pandas/tests/groupby/test_numeric_only.py @@ -275,7 +275,7 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_str "has no kernel", ) if using_infer_string: - import pyarrow as pa + pa = pytest.importorskip("pyarrow") errs = (TypeError, pa.lib.ArrowNotImplementedError) else: From 3dc222d2c006a0fc6879f831d3b2cf79cdb1301a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 9 Sep 2024 18:39:05 -0400 Subject: [PATCH 237/396] pick out stringarray keepdims changes from #59234 --- pandas/core/arrays/string_.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index a64be0b197494..1aa6fb70d250c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -736,7 +736,13 @@ def astype(self, dtype, copy: bool = True): return super().astype(dtype, copy) def _reduce( - self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs + self, + name: str, + *, + skipna: bool = True, + keepdims: bool = False, + axis: AxisInt | None = 0, + **kwargs, ): if self.dtype.na_value is np.nan and name in ["any", "all"]: if name == "any": @@ -745,8 +751,10 @@ def _reduce( return nanops.nanall(self._ndarray, skipna=skipna) if name in ["min", "max"]: - return getattr(self, name)(skipna=skipna, axis=axis) - + result = getattr(self, name)(skipna=skipna, axis=axis) + if keepdims: + return self._from_sequence([result], dtype=self.dtype) + return result raise TypeError(f"Cannot perform reduction '{name}' with string dtype") def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: From 4f628e88bc76b3ef02131f5b7c26cab507b7ba2a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 15:33:55 +0200 Subject: [PATCH 238/396] Fix: avoid object dtype inference warning in to_datetime --- pandas/core/tools/datetimes.py | 5 +++++ pandas/tests/tools/test_to_datetime.py | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 05262c235568d..8f700cfa63132 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -16,6 +16,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( lib, tslib, @@ -476,6 +478,9 @@ def _array_strptime_with_fallback( unit = np.datetime_data(result.dtype)[0] res = Index(result, dtype=f"M8[{unit}, UTC]", name=name) return res + elif using_string_dtype() and result.dtype == object: + if lib.is_string_array(result): + return Index(result, dtype="str", name=name) return Index(result, dtype=result.dtype, name=name) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index e6ae3c1a39bd0..e7e8f3ac63cd1 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1492,7 +1492,9 @@ def test_datetime_invalid_index(self, values, format): warn, match="Could not infer format", raise_on_extra_warnings=False ): res = to_datetime(values, errors="ignore", format=format) - tm.assert_index_equal(res, Index(values, dtype=object)) + tm.assert_index_equal( + res, Index(values, dtype="object" if format is None else "str") + ) with tm.assert_produces_warning( warn, match="Could not infer format", raise_on_extra_warnings=False @@ -3713,7 +3715,7 @@ def test_to_datetime_mixed_not_necessarily_iso8601_raise(): ("errors", "expected"), [ ("coerce", DatetimeIndex(["2020-01-01 
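For context on the parametrized cases immediately above and below this note: a short sketch of the two error modes, assuming a pandas build where errors="ignore" still exists (it is deprecated and emits a FutureWarning):

import pandas as pd

vals = ["2020-01-01", "01-01-2000"]  # second entry fails the format inferred from the first
print(pd.to_datetime(vals, errors="coerce"))
# DatetimeIndex(['2020-01-01', 'NaT'], dtype='datetime64[ns]', freq=None)

# errors="ignore" would instead hand the strings back as an Index; the
# to_datetime fix in patch 238 above makes that Index come back with
# "str" dtype rather than object when the future string dtype is
# enabled, which is what the dtype="str" expectation below encodes.
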
00:00:00", NaT])), - ("ignore", Index(["2020-01-01", "01-01-2000"], dtype=object)), + ("ignore", Index(["2020-01-01", "01-01-2000"], dtype="str")), ], ) def test_to_datetime_mixed_not_necessarily_iso8601_coerce(errors, expected): From 39260a01abda6af787d17b4d78283965f9f317d0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 15:43:57 +0200 Subject: [PATCH 239/396] xfail tests that trigger dtype inference warnings --- pandas/tests/frame/indexing/test_indexing.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 2ab518405503c..7a7586961deca 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -906,6 +906,8 @@ def test_setitem_frame_float(self, float_frame): expected = piece.values tm.assert_almost_equal(result, expected) + # dtype inference + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_frame_mixed(self, float_string_frame): # GH 3216 @@ -918,6 +920,8 @@ def test_setitem_frame_mixed(self, float_string_frame): f.loc[key] = piece tm.assert_almost_equal(f.loc[f.index[0:2], ["A", "B"]].values, piece.values) + # dtype inference + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_frame_mixed_rows_unaligned(self, float_string_frame): # GH#3216 rows unaligned f = float_string_frame.copy() @@ -932,6 +936,8 @@ def test_setitem_frame_mixed_rows_unaligned(self, float_string_frame): f.loc[f.index[0:2:], ["A", "B"]].values, piece.values[0:2] ) + # dtype inference + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_frame_mixed_key_unaligned(self, float_string_frame): # GH#3216 key is unaligned with values f = float_string_frame.copy() From 91e65b6ef077b2d400580e4665305ef1210014d2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 16:09:05 +0200 Subject: [PATCH 240/396] avoid dtype inference warnings by removing explicit dtype=object --- pandas/tests/dtypes/test_missing.py | 2 +- pandas/tests/frame/methods/test_set_index.py | 4 ++-- pandas/tests/series/indexing/test_getitem.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index e1f8d8eca2537..e3d3e98ae2b93 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -131,7 +131,7 @@ def test_isna_isnull(self, isna_f): [ np.arange(4, dtype=float), [0.0, 1.0, 0.0, 1.0], - Series(list("abcd"), dtype=object), + Series(list("abcd")), date_range("2020-01-01", periods=4), ], ) diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 5724f79b82578..1c8d365f0d6c0 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -158,8 +158,8 @@ def test_set_index(self, float_string_frame): def test_set_index_names(self): df = DataFrame( np.ones((10, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(10)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(10)]), ) df.index.name = "name" diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 9783dcd2fea07..9891684e9597c 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -360,7 +360,7 
@@ def test_getitem_no_matches(self, box): # GH#33462 we expect the same behavior for list/ndarray/Index/Series ser = Series(["A", "B"]) - key = Series(["C"], dtype=object) + key = Series(["C"]) key = box(key) msg = r"None of \[Index\(\['C'\], dtype='object|str'\)\] are in the \[index\]" From 380372fee75d7db9ea99e29d13d70e40f9bb6fdd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 16:40:59 +0200 Subject: [PATCH 241/396] un-xfail tests for replace/fillna downcasting --- pandas/tests/frame/methods/test_combine_first.py | 3 --- pandas/tests/frame/methods/test_fillna.py | 2 -- pandas/tests/frame/methods/test_replace.py | 3 --- pandas/tests/series/methods/test_replace.py | 1 - 4 files changed, 9 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index a4ee0b08e1e66..8aeab5dacd8b4 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import is_dtype_equal @@ -32,7 +30,6 @@ def test_combine_first_mixed(self): combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_combine_first(self, float_frame, using_infer_string): # disjoint head, tail = float_frame[:5], float_frame[5:] diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index d767e35878b52..e2baa2567f5b4 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -126,7 +126,6 @@ def test_fillna_empty(self, using_copy_on_write): df.x.fillna(method=m, inplace=True) df.x.fillna(method=m) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_fillna_different_dtype(self, using_infer_string): # with different dtype (GH#3386) df = DataFrame( @@ -371,7 +370,6 @@ def test_fillna_dictlike_value_duplicate_colnames(self, columns): expected["A"] = 0.0 tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_fillna_dtype_conversion(self, using_infer_string): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index fd8039975a514..0884c091ba96a 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -624,7 +624,6 @@ def test_replace_mixed_int_block_splitting(self): result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_mixed2(self, using_infer_string): # to object block upcasting df = DataFrame( @@ -1444,7 +1443,6 @@ def test_replace_ea_ignore_float(self, frame_or_series, value): result = obj.replace(1.0, 0.0) tm.assert_equal(expected, result) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_value_category_type(self): """ Test for #23305: to ensure category dtypes are maintained @@ -1540,7 +1538,6 @@ def test_replace_with_compiled_regex(self): expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def 
test_replace_intervals(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index c6727e023e786..850740fac907d 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -760,7 +760,6 @@ def test_replace_value_none_dtype_numeric(self, val): expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_change_dtype_series(self, using_infer_string): # GH#25797 df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]}) From 13bf07a144a50883ce2bd3d9a42436b9305ac966 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 16:51:40 +0200 Subject: [PATCH 242/396] xfail tests triggering empty concat warning --- pandas/tests/reshape/concat/test_empty.py | 4 ++++ pandas/tests/reshape/merge/test_join.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py index 9560087615123..8f7ea0c42f2c3 100644 --- a/pandas/tests/reshape/concat/test_empty.py +++ b/pandas/tests/reshape/concat/test_empty.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -238,6 +240,8 @@ def test_concat_empty_dataframe_dtypes(self): assert result["b"].dtype == np.float64 assert result["c"].dtype == np.float64 + # triggers warning about empty entries + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_inner_join_empty(self): # GH 15328 df_empty = DataFrame() diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 91f0cf6c31085..9188521c71158 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -341,6 +343,8 @@ def test_join_index_mixed_overlap(self): expected = _join_by_hand(df1, df2) tm.assert_frame_equal(joined, expected) + # triggers warning about empty entries + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_join_empty_bug(self): # generated an exception in 0.4.3 x = DataFrame() From 33072d0ef90185560f528aa1fb894e07d636f767 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 21:17:36 +0200 Subject: [PATCH 243/396] Update xfails for 2.3.x --- pandas/tests/arrays/categorical/test_analytics.py | 10 +--------- pandas/tests/copy_view/test_array.py | 3 --- pandas/tests/copy_view/test_interp_fillna.py | 4 +--- pandas/tests/frame/methods/test_info.py | 3 +-- pandas/tests/groupby/aggregate/test_cython.py | 1 - pandas/tests/indexes/base_class/test_reshape.py | 5 ++++- pandas/tests/indexing/test_loc.py | 3 +++ pandas/tests/io/parser/common/test_common_basic.py | 3 ++- pandas/tests/series/methods/test_replace.py | 1 + pandas/tests/series/test_logical_ops.py | 4 +++- 10 files changed, 16 insertions(+), 21 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index a38814ca43773..c2c53fbc4637e 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py 
+++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -4,12 +4,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import ( - HAS_PYARROW, - PYPY, -) +from pandas.compat import PYPY from pandas import ( Categorical, @@ -301,9 +296,6 @@ def test_nbytes(self): exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories assert cat.nbytes == exp - @pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" - ) def test_memory_usage(self): cat = Categorical([1, 2, 3]) diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index 5d0efdc149004..9a3f83e0293f5 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Series, @@ -159,7 +157,6 @@ def test_dataframe_array_ea_dtypes(using_copy_on_write): assert arr.flags.writeable is True -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dataframe_array_string_dtype(using_copy_on_write, using_array_manager): df = DataFrame({"a": ["a", "b"]}, dtype="string") arr = np.asarray(df) diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index 1f7f8689d0779..338b76cbf1e7a 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -3,8 +3,6 @@ from pandas._config import using_string_dtype -from pandas.compat import HAS_PYARROW - from pandas import ( NA, ArrowDtype, @@ -161,7 +159,7 @@ def test_interpolate_cleaned_fill_method(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_interpolate_object_convert_no_op(using_copy_on_write): df = DataFrame({"a": ["a", "b", "c"], "b": 1}) arr_a = get_array(df, "a") diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index 4594f725b43d5..475632667a87a 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -10,7 +10,6 @@ from pandas._config import using_string_dtype from pandas.compat import ( - HAS_PYARROW, IS64, PYPY, ) @@ -521,7 +520,7 @@ def test_info_int_columns(): assert result == expected -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_memory_usage_empty_no_warning(): # GH#50066 df = DataFrame(index=["a", "b"]) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index fbbace54a3444..fa8a6cb4120b2 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -164,7 +164,6 @@ def test_cython_agg_return_dict(): tm.assert_series_equal(ts, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_cython_fail_agg(): dr = bdate_range("1/1/2000", periods=50) ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 814a6a516904b..6a544e448ebe1 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -4,6 +4,7 @@ import numpy as 
np import pytest +import pandas as pd from pandas import Index import pandas._testing as tm @@ -35,7 +36,9 @@ def test_insert(self): null_index = Index([]) tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, "a")) - def test_insert_missing(self, nulls_fixture, using_infer_string): + def test_insert_missing(self, request, nulls_fixture, using_infer_string): + if using_infer_string and nulls_fixture is pd.NA: + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) # GH#22295 # test there is no mangling of NA values expected = Index(["a", nulls_fixture, "b", "c"], dtype=object) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 4d2d1e336ef07..d61b2ea642439 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1443,6 +1443,9 @@ def test_loc_setitem_listlike_with_timedelta64index(self, indexer, expected): tm.assert_frame_equal(expected, df) + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_loc_setitem_categorical_values_partial_column_slice(self): # Assigning a Category to parts of a int/... column uses the values of # the Categorical diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 95a7664304889..2abca1bf52374 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -14,6 +14,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.errors import ( EmptyDataError, ParserError, @@ -917,7 +918,7 @@ def test_dict_keys_as_names(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0 def test_encoding_surrogatepass(all_parsers): # GH39017 diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 850740fac907d..c59dbc4ed95d7 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -391,6 +391,7 @@ def test_replace_mixed_types_with_string(self): expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "categorical, numeric", [ diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index ff21427b71cf9..a9f1726afc942 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -368,7 +368,9 @@ def test_reverse_ops_with_index(self, op, expected): result = op(ser, idx) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" + ) def test_logical_ops_label_based(self, using_infer_string): # GH#4947 # logical ops should be label based From e14e99a877039f99c55f5c1405c5906181db3c1f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 21:50:50 +0200 Subject: [PATCH 244/396] Fix string dtype comparison in value_counts dtype inference deprecation --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 15a07da76d2f7..56600bd9a5107 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -935,7 +935,7 @@ def value_counts_internal( idx = idx.astype(object) elif ( idx.dtype != keys.dtype # noqa: PLR1714 # # pylint: disable=R1714 - and idx.dtype != "string[pyarrow_numpy]" + and idx.dtype != "string" ): warnings.warn( # GH#56161 From 0537c90f2576c123a8fb97faa6fc6420592ebfbd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 22:15:02 +0200 Subject: [PATCH 245/396] string[pyarrow_numpy] -> str --- pandas/tests/frame/methods/test_replace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 0884c091ba96a..ccee7ca24bd3d 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -452,7 +452,7 @@ def test_regex_replace_string_types( ) with tm.assert_produces_warning(FutureWarning, match="Downcasting"): result = obj.replace(to_replace, regex=True) - dtype = "string[pyarrow_numpy]" + dtype = "str" else: result = obj.replace(to_replace, regex=True) expected = frame_or_series(expected, dtype=dtype) From e825b0e5233332a6cc9f014afa316be549061526 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 3 Oct 2024 09:33:07 +0200 Subject: [PATCH 246/396] Fix cow ref tracking in replace with list and regex --- pandas/core/internals/blocks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index cd1639188b1ad..917a65348b7a3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1208,6 +1208,7 @@ def _replace_coerce( value, inplace=inplace, mask=mask, + using_cow=using_cow, ) else: if value is None: From 46fbd7f29538959c7c5f660036e49784362aaa28 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 3 Oct 2024 15:45:42 +0200 Subject: [PATCH 247/396] suppress pylint errors --- pyproject.toml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 238abd85dcdb1..ac91f0dcc7269 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -476,7 +476,11 @@ disable = [ "unnecessary-lambda", "unused-argument", "unused-variable", - "using-constant-test" + "using-constant-test", + + # disabled on 2.3.x branch + "consider-using-in", + "simplifiable-if-expression", ] [tool.pytest.ini_options] From 1eb8f0ebecfff3131082871214e6cdd27f913fca Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 9 Oct 2024 12:37:07 -0700 Subject: [PATCH 248/396] Backport PR #59985 on branch 2.3.x (Programming Language :: Python :: 3.13 added to pyproject.toml) (#60012) Backport PR #59985: Programming Language :: Python :: 3.13 added to pyproject.toml Co-authored-by: LOCHAN PAUDEL <104910006+nahcol10@users.noreply.github.com> --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index ac91f0dcc7269..6cf6caec79c27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ classifiers = [ 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13', 'Topic :: Scientific/Engineering' ] From c9d4b1bc58e04c75f3671d468826a1c7895a0d48 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 22 Aug 2024 
11:43:57 +0200 Subject: [PATCH 249/396] String dtype: fix pyarrow-based IO + update tests (#59478) --- pandas/io/_util.py | 2 + pandas/tests/io/test_feather.py | 27 +++++++++----- pandas/tests/io/test_fsspec.py | 6 +-- pandas/tests/io/test_gcs.py | 2 +- pandas/tests/io/test_orc.py | 25 +++++++------ pandas/tests/io/test_parquet.py | 65 ++++++++++++++++++++++----------- 6 files changed, 80 insertions(+), 47 deletions(-) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 68fcfcf65e0c2..50a97f1059b5c 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -24,6 +24,8 @@ def _arrow_dtype_mapping() -> dict: pa.string(): pd.StringDtype(), pa.float32(): pd.Float32Dtype(), pa.float64(): pd.Float64Dtype(), + pa.string(): pd.StringDtype(), + pa.large_string(): pd.StringDtype(), } diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index d1201686edefa..57e12747a3746 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -9,12 +9,10 @@ from pandas.io.feather_format import read_feather, to_feather # isort:skip -pytestmark = [ - pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" - ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + pa = pytest.importorskip("pyarrow") @@ -154,8 +152,8 @@ def test_path_localpath(self): def test_passthrough_keywords(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ).reset_index() self.check_round_trip(df, write_kwargs={"version": 1}) @@ -169,7 +167,9 @@ def test_http_path(self, feather_file, httpserver): res = read_feather(httpserver.url) tm.assert_frame_equal(expected, res) - def test_read_feather_dtype_backend(self, string_storage, dtype_backend): + def test_read_feather_dtype_backend( + self, string_storage, dtype_backend, using_infer_string + ): # GH#50765 df = pd.DataFrame( { @@ -191,7 +191,10 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - string_dtype = pd.ArrowDtype(pa.string()) + if using_infer_string: + string_dtype = pd.ArrowDtype(pa.large_string()) + else: + string_dtype = pd.ArrowDtype(pa.string()) else: string_dtype = pd.StringDtype(string_storage) @@ -218,6 +221,10 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): } ) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) tm.assert_frame_equal(result, expected) def test_int_columns_and_index(self): diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 19b60e17d3a92..5ed64e3eb0958 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -168,7 +168,7 @@ def test_excel_options(fsspectest): assert fsspectest.test[0] == "read" -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -198,7 +198,7 @@ def test_arrowparquet_options(fsspectest): 
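# A minimal sketch of the dtype mapping this patch extends (a hypothetical
# session; the values are arbitrary): the pa.large_string() ->
# pd.StringDtype() entry added to _arrow_dtype_mapping above means pyarrow
# large_string columns load as pandas StringDtype instead of falling back to
# object. `types_mapper` is pyarrow's standard Table.to_pandas hook; pandas'
# readers feed _arrow_dtype_mapping().get into that same hook for the
# numpy_nullable backend.
import pandas as pd
import pyarrow as pa

tbl = pa.table({"a": pa.array(["x", None], type=pa.large_string())})
mapping = {pa.large_string(): pd.StringDtype()}
df = tbl.to_pandas(types_mapper=mapping.get)
print(df["a"].dtype)  # string -- rather than object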
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_fastparquet_options(fsspectest): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -256,7 +256,7 @@ def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") @pytest.mark.single_cpu @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet def test_s3_parquet(s3_public_bucket, s3so, df1): diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 96bc0326b23ab..81f951b3958b0 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -197,7 +197,7 @@ def test_to_csv_compression_encoding_gcs( tm.assert_frame_equal(df, read_df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 52d6850483418..d2204a9134f90 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import read_orc import pandas._testing as tm @@ -19,12 +17,9 @@ import pyarrow as pa -pytestmark = [ - pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" - ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) @pytest.fixture @@ -47,7 +42,7 @@ def orc_writer_dtypes_not_supported(request): return pd.DataFrame({"unimpl": request.param}) -def test_orc_reader_empty(dirpath): +def test_orc_reader_empty(dirpath, using_infer_string): columns = [ "boolean1", "byte1", @@ -68,11 +63,12 @@ def test_orc_reader_empty(dirpath): "float32", "float64", "object", - "object", + "str" if using_infer_string else "object", ] expected = pd.DataFrame(index=pd.RangeIndex(0)) for colname, dtype in zip(columns, dtypes): expected[colname] = pd.Series(dtype=dtype) + expected.columns = expected.columns.astype("str") inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc") got = read_orc(inputfile, columns=columns) @@ -309,7 +305,7 @@ def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported): orc_writer_dtypes_not_supported.to_orc() -def test_orc_dtype_backend_pyarrow(): +def test_orc_dtype_backend_pyarrow(using_infer_string): pytest.importorskip("pyarrow") df = pd.DataFrame( { @@ -340,6 +336,13 @@ def test_orc_dtype_backend_pyarrow(): for col in df.columns } ) + if using_infer_string: + # ORC does not preserve distinction between string and large string + # -> the default large string comes back as string + string_dtype = pd.ArrowDtype(pa.string()) + expected["string"] = expected["string"].astype(string_dtype) + expected["string_with_nan"] = expected["string_with_nan"].astype(string_dtype) + expected["string_with_none"] = 
expected["string_with_none"].astype(string_dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 59662ec77d52f..578c0949a6c97 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -55,7 +55,6 @@ pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] @@ -64,11 +63,18 @@ params=[ pytest.param( "fastparquet", - marks=pytest.mark.skipif( - not _HAVE_FASTPARQUET - or _get_option("mode.data_manager", silent=True) == "array", - reason="fastparquet is not installed or ArrayManager is used", - ), + marks=[ + pytest.mark.skipif( + not _HAVE_FASTPARQUET + or _get_option("mode.data_manager", silent=True) == "array", + reason="fastparquet is not installed or ArrayManager is used", + ), + pytest.mark.xfail( + using_string_dtype(), + reason="TODO(infer_string) fastparquet", + strict=False, + ), + ], ), pytest.param( "pyarrow", @@ -90,17 +96,24 @@ def pa(): @pytest.fixture -def fp(): +def fp(request): if not _HAVE_FASTPARQUET: pytest.skip("fastparquet is not installed") elif _get_option("mode.data_manager", silent=True) == "array": pytest.skip("ArrayManager is not supported with fastparquet") + if using_string_dtype(): + request.applymarker( + pytest.mark.xfail(reason="TODO(infer_string) fastparquet", strict=False) + ) return "fastparquet" @pytest.fixture def df_compat(): - return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}) + # TODO(infer_string) should this give str columns? + return pd.DataFrame( + {"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"], dtype=object) + ) @pytest.fixture @@ -389,16 +402,6 @@ def check_external_error_on_write(self, df, engine, exc): with tm.external_error_raised(exc): to_parquet(df, path, engine, compression=None) - @pytest.mark.network - @pytest.mark.single_cpu - def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): - if engine != "auto": - pytest.importorskip(engine) - with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: - httpserver.serve_content(content=f.read()) - df = read_parquet(httpserver.url) - tm.assert_frame_equal(df, df_compat) - class TestBasic(Base): def test_error(self, engine): @@ -696,6 +699,16 @@ def test_read_empty_array(self, pa, dtype): df, pa, read_kwargs={"dtype_backend": "numpy_nullable"}, expected=expected ) + @pytest.mark.network + @pytest.mark.single_cpu + def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): + if engine != "auto": + pytest.importorskip(engine) + with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: + httpserver.serve_content(content=f.read()) + df = read_parquet(httpserver.url, engine=engine) + tm.assert_frame_equal(df, df_compat) + class TestParquetPyArrow(Base): def test_basic(self, pa, df_full): @@ -925,7 +938,7 @@ def test_write_with_schema(self, pa): out_df = df.astype(bool) check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df) - def test_additional_extension_arrays(self, pa): + def test_additional_extension_arrays(self, pa, using_infer_string): # test additional ExtensionArrays that are supported through the # __arrow_array__ protocol pytest.importorskip("pyarrow") @@ -936,17 +949,25 @@ def test_additional_extension_arrays(self, pa): "c": pd.Series(["a", None, "c"], dtype="string"), } ) - check_round_trip(df, pa) + if using_infer_string: + check_round_trip(df, pa, 
expected=df.astype({"c": "str"})) + else: + check_round_trip(df, pa) df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) check_round_trip(df, pa) - def test_pyarrow_backed_string_array(self, pa, string_storage): + def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_string): # test ArrowStringArray supported through the __arrow_array__ protocol pytest.importorskip("pyarrow") df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) with pd.option_context("string_storage", string_storage): - check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]")) + if using_infer_string: + expected = df.astype("str") + expected.columns = expected.columns.astype("str") + else: + expected = df.astype(f"string[{string_storage}]") + check_round_trip(df, pa, expected=expected) def test_additional_extension_types(self, pa): # test additional ExtensionArrays that are supported through the From 60175cc5b706b701d07235986c08ada2e6087879 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 22 Aug 2024 13:55:02 -0700 Subject: [PATCH 250/396] REF (string): avoid copy in StringArray factorize (#59551) * REF: avoid copy in StringArray factorize * mypy fixup * un-xfail --- pandas/_libs/arrays.pyx | 4 ++++ pandas/_libs/hashtable.pyx | 5 ++++- pandas/_libs/hashtable_class_helper.pxi.in | 18 +++++++++++++++--- pandas/core/arrays/_mixins.py | 19 ++++++++----------- pandas/core/arrays/categorical.py | 5 ----- pandas/core/arrays/numpy_.py | 3 --- pandas/core/arrays/string_.py | 12 +++--------- pandas/tests/groupby/test_groupby_dropna.py | 3 --- pandas/tests/window/test_rolling.py | 6 ------ 9 files changed, 34 insertions(+), 41 deletions(-) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 9889436a542c1..2932f3ff56396 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -67,6 +67,10 @@ cdef class NDArrayBacked: """ Construct a new ExtensionArray `new_array` with `arr` as its _ndarray. + The returned array has the same dtype as self. + + Caller is responsible for ensuring `values.dtype == self._ndarray.dtype`. 
+ This should round-trip: self == self._from_backing_data(self._ndarray) """ diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index ccac3d0b50d45..127b0b845d219 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -33,7 +33,10 @@ from pandas._libs.khash cimport ( kh_python_hash_func, khiter_t, ) -from pandas._libs.missing cimport checknull +from pandas._libs.missing cimport ( + checknull, + is_matching_na, +) def get_hashtable_trace_domain(): diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c0723392496c1..c42bccb7f38f7 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1121,11 +1121,13 @@ cdef class StringHashTable(HashTable): const char **vecs khiter_t k bint use_na_value + bint non_null_na_value if return_inverse: labels = np.zeros(n, dtype=np.intp) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None + non_null_na_value = not checknull(na_value) # assign pointers and pre-filter out missing (if ignore_na) vecs = malloc(n * sizeof(char *)) @@ -1134,7 +1136,12 @@ cdef class StringHashTable(HashTable): if (ignore_na and (not isinstance(val, str) - or (use_na_value and val == na_value))): + or (use_na_value and ( + (non_null_na_value and val == na_value) or + (not non_null_na_value and is_matching_na(val, na_value))) + ) + ) + ): # if missing values do not count as unique values (i.e. if # ignore_na is True), we can skip the actual value, and # replace the label with na_sentinel directly @@ -1400,10 +1407,11 @@ cdef class PyObjectHashTable(HashTable): object val khiter_t k bint use_na_value - + bint non_null_na_value if return_inverse: labels = np.empty(n, dtype=np.intp) use_na_value = na_value is not None + non_null_na_value = not checknull(na_value) for i in range(n): val = values[i] @@ -1411,7 +1419,11 @@ cdef class PyObjectHashTable(HashTable): if ignore_na and ( checknull(val) - or (use_na_value and val == na_value) + or (use_na_value and ( + (non_null_na_value and val == na_value) or + (not non_null_na_value and is_matching_na(val, na_value)) + ) + ) ): # if missing values do not count as unique values (i.e. if # ignore_na is True), skip the hashtable entry for them, and diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 0da121c36644a..cb6861a8dd00f 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -515,17 +515,14 @@ def _quantile( fill_value = self._internal_fill_value res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation) - - res_values = self._cast_quantile_result(res_values) - return self._from_backing_data(res_values) - - # TODO: see if we can share this with other dispatch-wrapping methods - def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: - """ - Cast the result of quantile_with_mask to an appropriate dtype - to pass to _from_backing_data in _quantile. - """ - return res_values + if res_values.dtype == self._ndarray.dtype: + return self._from_backing_data(res_values) + else: + # e.g. test_quantile_empty we are empty integer dtype and res_values + # has floating dtype + # TODO: technically __init__ isn't defined here. + # Should we raise NotImplementedError and handle this on NumpyEA? 
+ return type(self)(res_values) # type: ignore[call-arg] # ------------------------------------------------------------------------ # numpy-like methods diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f191f7277743f..6ffc0df243130 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2475,11 +2475,6 @@ def unique(self) -> Self: # pylint: disable=useless-parent-delegation return super().unique() - def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: - # make sure we have correct itemsize for resulting codes - assert res_values.dtype == self._ndarray.dtype - return res_values - def equals(self, other: object) -> bool: """ Returns True if categorical arrays are equal. diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 03712f75db0c7..aafcd82114b97 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -137,9 +137,6 @@ def _from_sequence( result = result.copy() return cls(result) - def _from_backing_data(self, arr: np.ndarray) -> NumpyExtensionArray: - return type(self)(arr) - # ------------------------------------------------------------------------ # Data diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 1aa6fb70d250c..fa1e5e605e16e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -657,11 +657,10 @@ def __arrow_array__(self, type=None): values[self.isna()] = None return pa.array(values, type=type, from_pandas=True) - def _values_for_factorize(self): + def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: # type: ignore[override] arr = self._ndarray.copy() - mask = self.isna() - arr[mask] = None - return arr, None + + return arr, self.dtype.na_value def __setitem__(self, key, value) -> None: value = extract_array(value, extract_numpy=True) @@ -871,8 +870,3 @@ def _from_sequence( if dtype is None: dtype = StringDtype(storage="python", na_value=np.nan) return super()._from_sequence(scalars, dtype=dtype, copy=copy) - - def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: - # need to override NumpyExtensionArray._from_backing_data to ensure - # we always preserve the dtype - return NDArrayBacked._from_backing_data(self, arr) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d843a992daee0..3856a0d8928a7 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -388,9 +388,6 @@ def test_groupby_dropna_with_multiindex_input(input_index, keys, series): tm.assert_equal(result, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) def test_groupby_nan_included(): # GH 35646 data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index acf636616421f..f353a7fa2f0fe 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -6,10 +6,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( - HAS_PYARROW, IS64, is_platform_arm, is_platform_power, @@ -1423,9 +1420,6 @@ def test_rolling_corr_timedelta_index(index, window): tm.assert_almost_equal(result, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) def test_groupby_rolling_nan_included(): 
# GH 35542 data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} From daa46c1a8aff548c307c960a341eaa430caba0d8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 27 Aug 2024 17:51:42 +0200 Subject: [PATCH 251/396] String dtype: avoid surfacing pyarrow exception in binary operations (#59610) --- pandas/core/arrays/arrow/array.py | 40 ++++++++++++++--- pandas/core/arrays/string_.py | 5 ++- pandas/tests/arithmetic/test_object.py | 25 +++-------- .../tests/arrays/boolean/test_arithmetic.py | 26 +++-------- .../tests/arrays/floating/test_arithmetic.py | 23 ++++------ .../tests/arrays/integer/test_arithmetic.py | 34 +++++--------- pandas/tests/extension/base/ops.py | 10 +---- .../tests/extension/decimal/test_decimal.py | 2 +- pandas/tests/extension/test_arrow.py | 38 +++------------- pandas/tests/extension/test_string.py | 32 ++++--------- pandas/tests/frame/test_logical_ops.py | 20 +++------ pandas/tests/frame/test_unary.py | 26 +++-------- pandas/tests/indexes/object/test_indexing.py | 45 ++++++------------- pandas/tests/indexes/test_old_base.py | 14 ++---- pandas/tests/series/test_arithmetic.py | 26 +++-------- pandas/tests/series/test_logical_ops.py | 36 +++++---------- 16 files changed, 129 insertions(+), 273 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 46f2cbb2ebeef..5f8963c81b0ba 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -676,7 +676,12 @@ def __invert__(self) -> Self: return type(self)(pc.invert(self._pa_array)) def __neg__(self) -> Self: - return type(self)(pc.negate_checked(self._pa_array)) + try: + return type(self)(pc.negate_checked(self._pa_array)) + except pa.ArrowNotImplementedError as err: + raise TypeError( + f"unary '-' not supported for dtype '{self.dtype}'" + ) from err def __pos__(self) -> Self: return type(self)(self._pa_array) @@ -731,8 +736,19 @@ def _cmp_method(self, other, op): ) return ArrowExtensionArray(result) - def _evaluate_op_method(self, other, op, arrow_funcs): + def _op_method_error_message(self, other, op) -> str: + if hasattr(other, "dtype"): + other_type = f"dtype '{other.dtype}'" + else: + other_type = f"object of type {type(other)}" + return ( + f"operation '{op.__name__}' not supported for " + f"dtype '{self.dtype}' with {other_type}" + ) + + def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: pa_type = self._pa_array.type + other_original = other other = self._box_pa(other) if ( @@ -742,10 +758,15 @@ def _evaluate_op_method(self, other, op, arrow_funcs): ): if op in [operator.add, roperator.radd]: sep = pa.scalar("", type=pa_type) - if op is operator.add: - result = pc.binary_join_element_wise(self._pa_array, other, sep) - elif op is roperator.radd: - result = pc.binary_join_element_wise(other, self._pa_array, sep) + try: + if op is operator.add: + result = pc.binary_join_element_wise(self._pa_array, other, sep) + elif op is roperator.radd: + result = pc.binary_join_element_wise(other, self._pa_array, sep) + except pa.ArrowNotImplementedError as err: + raise TypeError( + self._op_method_error_message(other_original, op) + ) from err return type(self)(result) elif op in [operator.mul, roperator.rmul]: binary = self._pa_array @@ -777,9 +798,14 @@ def _evaluate_op_method(self, other, op, arrow_funcs): pc_func = arrow_funcs[op.__name__] if pc_func is NotImplemented: + if pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type): + raise TypeError(self._op_method_error_message(other_original, op)) raise 
NotImplementedError(f"{op.__name__} not implemented.") - result = pc_func(self._pa_array, other) + try: + result = pc_func(self._pa_array, other) + except pa.ArrowNotImplementedError as err: + raise TypeError(self._op_method_error_message(other_original, op)) from err return type(self)(result) def _logical_method(self, other, op): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index fa1e5e605e16e..c04ec13dbd81c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -823,8 +823,11 @@ def _cmp_method(self, other, op): f"Lengths of operands do not match: {len(self)} != {len(other)}" ) - other = np.asarray(other) + # for array-likes, first filter out NAs before converting to numpy + if not is_array_like(other): + other = np.asarray(other) other = other[valid] + other = np.asarray(other) if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray, dtype="object") diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 899ea1910d055..bc0f78d3aa01a 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -8,9 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -318,27 +315,17 @@ def test_add(self): expected = pd.Index(["1a", "1b", "1c"]) tm.assert_index_equal("1" + index, expected) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) - def test_sub_fail(self, using_infer_string): + def test_sub_fail(self): index = pd.Index([str(i) for i in range(10)]) - if using_infer_string: - import pyarrow as pa - - err = pa.lib.ArrowNotImplementedError - msg = "has no kernel" - else: - err = TypeError - msg = "unsupported operand type|Cannot broadcast" - with pytest.raises(err, match=msg): + msg = "unsupported operand type|Cannot broadcast|sub' not supported" + with pytest.raises(TypeError, match=msg): index - "a" - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): index - index - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): index - index.tolist() - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): index.tolist() - index def test_sub_object(self): diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index 4dbd8eb9f5ca7..9ff690cdc914d 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -3,10 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - import pandas as pd import pandas._testing as tm @@ -94,19 +90,8 @@ def test_op_int8(left_array, right_array, opname): # ----------------------------------------------------------------------------- -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) -def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): +def test_error_invalid_values(data, all_arithmetic_operators): # invalid ops - - if using_infer_string: - import pyarrow as pa - - err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) - else: - err = TypeError - op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) @@ -116,7 +101,8 @@ def test_error_invalid_values(data, 
all_arithmetic_operators, using_infer_string "did not contain a loop with signature matching types|" "BooleanArray cannot perform the operation|" "not supported for the input types, and the inputs could not be safely coerced " - "to any supported types according to the casting rule ''safe''" + "to any supported types according to the casting rule ''safe''|" + "not supported for dtype" ) with pytest.raises(TypeError, match=msg): ops("foo") @@ -125,9 +111,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string r"unsupported operand type\(s\) for", "Concatenation operation is not implemented for NumPy arrays", "has no kernel", + "not supported for dtype", ] ) - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes @@ -140,7 +127,8 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "not all arguments converted during string formatting", "has no kernel", "not implemented", + "not supported for dtype", ] ) - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Series("foo", index=s.index)) diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index 768d3c1449fa4..009fac4c2f5ed 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.core.arrays import FloatingArray @@ -124,19 +122,11 @@ def test_arith_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): +def test_error_invalid_values(data, all_arithmetic_operators): op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) - if using_infer_string: - import pyarrow as pa - - errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) - else: - errs = TypeError - # invalid scalars msg = "|".join( [ @@ -152,15 +142,17 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "Concatenation operation is not implemented for NumPy arrays", "has no kernel", "not implemented", + "not supported for dtype", + "Can only string multiply by an integer", ] ) - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops("foo") - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Series("foo", index=s.index)) msg = "|".join( @@ -181,9 +173,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "cannot subtract DatetimeArray from ndarray", "has no kernel", "not implemented", + "not supported for dtype", ] ) - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index 8aa8c2db940b4..dee3deeee0f2f 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -3,8 +3,6 @@ import numpy as 
np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.core import ops @@ -174,19 +172,11 @@ def test_numpy_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): +def test_error_invalid_values(data, all_arithmetic_operators): op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) - if using_infer_string: - import pyarrow as pa - - errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) - else: - errs = TypeError - # invalid scalars msg = "|".join( [ @@ -201,24 +191,21 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "has no kernel", "not implemented", "The 'out' kwarg is necessary. Use numpy.strings.multiply without it.", + "not supported for dtype", ] ) - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops("foo") - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes str_ser = pd.Series("foo", index=s.index) # with pytest.raises(TypeError, match=msg): - if ( - all_arithmetic_operators - in [ - "__mul__", - "__rmul__", - ] - and not using_infer_string - ): # (data[~data.isna()] >= 0).all(): + if all_arithmetic_operators in [ + "__mul__", + "__rmul__", + ]: # (data[~data.isna()] >= 0).all(): res = ops(str_ser) expected = pd.Series(["foo" * x for x in data], index=s.index) expected = expected.fillna(np.nan) @@ -227,7 +214,7 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string # more-correct than np.nan here. tm.assert_series_equal(res, expected) else: - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(str_ser) msg = "|".join( @@ -242,9 +229,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string "cannot subtract DatetimeArray from ndarray", "has no kernel", "not implemented", + "not supported for dtype", ] ) - with pytest.raises(errs, match=msg): + with pytest.raises(TypeError, match=msg): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index ff9f3cbed64a2..547114ecfddd0 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -24,7 +24,7 @@ class BaseOpsUtil: def _get_expected_exception( self, op_name: str, obj, other - ) -> type[Exception] | None: + ) -> type[Exception] | tuple[type[Exception], ...] 
| None: # Find the Exception, if any we expect to raise calling # obj.__op_name__(other) @@ -39,14 +39,6 @@ def _get_expected_exception( else: result = self.frame_scalar_exc - if using_string_dtype() and result is not None: - import pyarrow as pa - - result = ( # type: ignore[assignment] - result, - pa.lib.ArrowNotImplementedError, - NotImplementedError, - ) return result def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 9907e345ada63..8afb989508e04 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -68,7 +68,7 @@ def data_for_grouping(): class TestDecimalArray(base.ExtensionTests): def _get_expected_exception( self, op_name: str, obj, other - ) -> type[Exception] | None: + ) -> type[Exception] | tuple[type[Exception], ...] | None: return None def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d9a3033b8380e..0e8e1809d08ac 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -800,8 +800,6 @@ def test_value_counts_returns_pyarrow_int64(self, data): _combine_le_expected_dtype = "bool[pyarrow]" - divmod_exc = NotImplementedError - def get_op_from_name(self, op_name): short_opname = op_name.strip("_") if short_opname == "rtruediv": @@ -935,10 +933,11 @@ def _is_temporal_supported(self, opname, pa_dtype): def _get_expected_exception( self, op_name: str, obj, other - ) -> type[Exception] | None: + ) -> type[Exception] | tuple[type[Exception], ...] | None: if op_name in ("__divmod__", "__rdivmod__"): - return self.divmod_exc + return (NotImplementedError, TypeError) + exc: type[Exception] | tuple[type[Exception], ...] | None dtype = tm.get_dtype(obj) # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no # attribute "pyarrow_dtype" @@ -949,7 +948,7 @@ def _get_expected_exception( "__mod__", "__rmod__", }: - exc = NotImplementedError + exc = (NotImplementedError, TypeError) elif arrow_temporal_supported: exc = None elif op_name in ["__add__", "__radd__"] and ( @@ -961,10 +960,7 @@ def _get_expected_exception( or pa.types.is_integer(pa_dtype) or pa.types.is_decimal(pa_dtype) ): - # TODO: in many of these cases, e.g. non-duration temporal, - # these will *never* be allowed. Would it make more sense to - # re-raise as TypeError, more consistent with non-pyarrow cases? - exc = pa.ArrowNotImplementedError + exc = TypeError else: exc = None return exc @@ -1020,14 +1016,6 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request) if all_arithmetic_operators == "__rmod__" and pa.types.is_binary(pa_dtype): pytest.skip("Skip testing Python string formatting") - elif all_arithmetic_operators in ("__rmul__", "__mul__") and ( - pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype) - ): - request.applymarker( - pytest.mark.xfail( - raises=TypeError, reason="Can only string multiply by an integer." 
- ) - ) mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: @@ -1042,14 +1030,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) ): pytest.skip("Skip testing Python string formatting") - elif all_arithmetic_operators in ("__rmul__", "__mul__") and ( - pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype) - ): - request.applymarker( - pytest.mark.xfail( - raises=TypeError, reason="Can only string multiply by an integer." - ) - ) mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: @@ -1073,14 +1053,6 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators, request): ), ) ) - elif all_arithmetic_operators in ("__rmul__", "__mul__") and ( - pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype) - ): - request.applymarker( - pytest.mark.xfail( - raises=TypeError, reason="Can only string multiply by an integer." - ) - ) mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index f800f734ec9d9..e44881a6d78ff 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -168,24 +168,15 @@ def test_fillna_no_op_returns_copy(self, data): def _get_expected_exception( self, op_name: str, obj, other - ) -> type[Exception] | None: - if op_name in ["__divmod__", "__rdivmod__"]: - if ( - isinstance(obj, pd.Series) - and cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow" - ): - # TODO: re-raise as TypeError? - return NotImplementedError - elif ( - isinstance(other, pd.Series) - and cast(StringDtype, tm.get_dtype(other)).storage == "pyarrow" - ): - # TODO: re-raise as TypeError? - return NotImplementedError - return TypeError - elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]: - if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow": - return NotImplementedError + ) -> type[Exception] | tuple[type[Exception], ...] | None: + if op_name in [ + "__mod__", + "__rmod__", + "__divmod__", + "__rdivmod__", + "__pow__", + "__rpow__", + ]: return TypeError elif op_name in ["__mul__", "__rmul__"]: # Can only multiply strings by integers @@ -198,11 +189,6 @@ def _get_expected_exception( "__sub__", "__rsub__", ]: - if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow": - import pyarrow as pa - - # TODO: better to re-raise as TypeError? 
- return pa.ArrowNotImplementedError return TypeError return None diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py index 2684704f86b82..f1163e994557f 100644 --- a/pandas/tests/frame/test_logical_ops.py +++ b/pandas/tests/frame/test_logical_ops.py @@ -4,10 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - from pandas import ( CategoricalIndex, DataFrame, @@ -100,9 +96,6 @@ def test_logical_ops_int_frame(self): res_ser = df1a_int["A"] | df1a_bool["A"] tm.assert_series_equal(res_ser, df1a_bool["A"]) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_logical_ops_invalid(self, using_infer_string): # GH#5808 @@ -114,15 +107,12 @@ def test_logical_ops_invalid(self, using_infer_string): df1 = DataFrame("foo", index=[1], columns=["A"]) df2 = DataFrame(True, index=[1], columns=["A"]) - msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'") - if using_infer_string: - import pyarrow as pa - - with pytest.raises(pa.lib.ArrowNotImplementedError, match="|has no kernel"): - df1 | df2 + if using_infer_string and df1["A"].dtype.storage == "pyarrow": + msg = "operation 'or_' not supported for dtype 'str'" else: - with pytest.raises(TypeError, match=msg): - df1 | df2 + msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'") + with pytest.raises(TypeError, match=msg): + df1 | df2 def test_logical_operators(self): def _check_bin_op(op): diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index 8e1df679ee1b4..a76d33e922486 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -44,11 +44,6 @@ def test_neg_object(self, df, expected): tm.assert_frame_equal(-df, expected) tm.assert_series_equal(-df["a"], expected["a"]) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) @pytest.mark.parametrize( "df", [ @@ -59,22 +54,13 @@ def test_neg_object(self, df, expected): def test_neg_raises(self, df, using_infer_string): msg = ( "bad operand type for unary -: 'str'|" - r"bad operand type for unary -: 'DatetimeArray'" + r"bad operand type for unary -: 'DatetimeArray'|" + "unary '-' not supported for dtype" ) - if using_infer_string and df.dtypes.iloc[0] == "string": - import pyarrow as pa - - msg = "has no kernel" - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - (-df) - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - (-df["a"]) - - else: - with pytest.raises(TypeError, match=msg): - (-df) - with pytest.raises(TypeError, match=msg): - (-df["a"]) + with pytest.raises(TypeError, match=msg): + (-df) + with pytest.raises(TypeError, match=msg): + (-df["a"]) def test_invert(self, float_frame): df = float_frame diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index add2f3f18b348..322e6677fe05d 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -3,13 +3,10 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.missing import ( NA, is_matching_na, ) -from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -31,39 +28,25 @@ def test_get_indexer_strings(self, method, expected): tm.assert_numpy_array_equal(actual, expected) - @pytest.mark.xfail( - 
using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_get_indexer_strings_raises(self, using_infer_string): index = Index(["b", "c"]) - if using_infer_string: - import pyarrow as pa - - msg = "has no kernel" - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="nearest") - - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) - - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - index.get_indexer( - ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] - ) - - else: - msg = r"unsupported operand type\(s\) for -: 'str' and 'str'" - with pytest.raises(TypeError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="nearest") + msg = "|".join( + [ + "operation 'sub' not supported for dtype 'str'", + r"unsupported operand type\(s\) for -: 'str' and 'str'", + ] + ) + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="nearest") - with pytest.raises(TypeError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) - with pytest.raises(TypeError, match=msg): - index.get_indexer( - ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] - ) + with pytest.raises(TypeError, match=msg): + index.get_indexer( + ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] + ) def test_get_indexer_with_NA_values( self, unique_nulls_fixture, unique_nulls_fixture2 diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 8d859a61a2bd5..c17d4f54c36c5 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -853,7 +853,6 @@ def test_append_preserves_dtype(self, simple_index): alt = index.take(list(range(N)) * 2) tm.assert_index_equal(result, alt, check_exact=True) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_inv(self, simple_index, using_infer_string): idx = simple_index @@ -867,21 +866,14 @@ def test_inv(self, simple_index, using_infer_string): tm.assert_series_equal(res2, Series(expected)) else: if idx.dtype.kind == "f": - err = TypeError msg = "ufunc 'invert' not supported for the input types" - elif using_infer_string and idx.dtype == "string": - import pyarrow as pa - - err = pa.lib.ArrowNotImplementedError - msg = "has no kernel" else: - err = TypeError - msg = "bad operand" - with pytest.raises(err, match=msg): + msg = "bad operand|__invert__ is not supported for string dtype" + with pytest.raises(TypeError, match=msg): ~idx # check that we get the same behavior with Series - with pytest.raises(err, match=msg): + with pytest.raises(TypeError, match=msg): ~Series(idx) def test_is_boolean_is_deprecated(self, simple_index): diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 1ffc9ddca5adf..a65d7687cfb06 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas._libs.tslibs import IncompatibleFrequency @@ -214,9 +212,9 @@ def test_series_integer_mod(self, index): s1 = Series(range(1, 10)) s2 = Series("foo", index=index) - msg = "not all arguments converted during string 
formatting|mod not" + msg = "not all arguments converted during string formatting|'mod' not supported" - with pytest.raises((TypeError, NotImplementedError), match=msg): + with pytest.raises(TypeError, match=msg): s2 % s1 def test_add_with_duplicate_index(self): @@ -501,28 +499,14 @@ def test_ser_cmp_result_names(self, names, comparison_op): result = op(ser, cidx) assert result.name == names[2] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_comparisons(self, using_infer_string): + def test_comparisons(self): s = Series(["a", "b", "c"]) s2 = Series([False, True, False]) # it works! exp = Series([False, False, False]) - if using_infer_string: - import pyarrow as pa - - msg = "has no kernel" - # TODO(3.0) GH56008 - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - s == s2 - with tm.assert_produces_warning( - DeprecationWarning, match="comparison", check_stacklevel=False - ): - with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): - s2 == s - else: - tm.assert_series_equal(s == s2, exp) - tm.assert_series_equal(s2 == s, exp) + tm.assert_series_equal(s == s2, exp) + tm.assert_series_equal(s2 == s, exp) # ----------------------------------------------------------------- # Categorical Dtype Comparisons diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index a9f1726afc942..b9ddfc189edce 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -6,8 +6,6 @@ from pandas._config import using_string_dtype -from pandas.compat import HAS_PYARROW - from pandas import ( ArrowDtype, DataFrame, @@ -151,10 +149,7 @@ def test_logical_operators_int_dtype_with_bool(self): expected = Series([False, True, True, True]) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) - def test_logical_operators_int_dtype_with_object(self, using_infer_string): + def test_logical_operators_int_dtype_with_object(self): # GH#9016: support bitwise op for integer types s_0123 = Series(range(4), dtype="int64") @@ -163,14 +158,10 @@ def test_logical_operators_int_dtype_with_object(self, using_infer_string): tm.assert_series_equal(result, expected) s_abNd = Series(["a", "b", np.nan, "d"]) - if using_infer_string: - import pyarrow as pa - - with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): - s_0123 & s_abNd - else: - with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): - s_0123 & s_abNd + with pytest.raises( + TypeError, match="unsupported.* 'int' and 'str'|'rand_' not supported" + ): + s_0123 & s_abNd def test_logical_operators_bool_dtype_with_int(self): index = list("bca") @@ -368,9 +359,7 @@ def test_reverse_ops_with_index(self, op, expected): result = op(ser, idx) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_logical_ops_label_based(self, using_infer_string): # GH#4947 # logical ops should be label based @@ -439,15 +428,12 @@ def test_logical_ops_label_based(self, using_infer_string): tm.assert_series_equal(result, a[a]) for e in [Series(["z"])]: - warn = FutureWarning if using_infer_string else None if using_infer_string: - import pyarrow as pa - - with tm.assert_produces_warning(warn, match="Operation between non"): - with pytest.raises( - pa.lib.ArrowNotImplementedError, match="has 
no kernel" - ): - result = a[a | e] + # TODO(infer_string) should this behave differently? + with pytest.raises( + TypeError, match="not supported for dtype|unsupported operand type" + ): + result = a[a | e] else: result = a[a | e] tm.assert_series_equal(result, a[a]) From 616ede57a1415d8498be54284e2b7d734ca22396 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 27 Aug 2024 19:51:35 +0200 Subject: [PATCH 252/396] DOC: Add whatsnew for 2.3.0 (#59625) * DOC: Add whatsnew for 2.3.0 * fix duplicate label --- doc/source/whatsnew/index.rst | 8 ++ doc/source/whatsnew/v2.3.0.rst | 177 +++++++++++++++++++++++++++++++++ 2 files changed, 185 insertions(+) create mode 100644 doc/source/whatsnew/v2.3.0.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 09d76d71c6e1b..ae96d0f8296f2 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -10,6 +10,14 @@ This is the list of changes to pandas between each release. For full details, see the `commit logs `_. For install and upgrade instructions, see :ref:`install`. +Version 2.3 +----------- + +.. toctree:: + :maxdepth: 2 + + v2.3.0 + Version 2.2 ----------- diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst new file mode 100644 index 0000000000000..d1881bf04826f --- /dev/null +++ b/doc/source/whatsnew/v2.3.0.rst @@ -0,0 +1,177 @@ +.. _whatsnew_230: + +What's new in 2.3.0 (Month XX, 2024) +------------------------------------ + +These are the changes in pandas 2.3.0. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_230.upcoming_changes: + +Upcoming changes in pandas 3.0 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +.. _whatsnew_230.enhancements: + +Enhancements +~~~~~~~~~~~~ + +.. _whatsnew_230.enhancements.enhancement1: + +enhancement1 +^^^^^^^^^^^^ + + +.. _whatsnew_230.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.notable_bug_fixes: + +Notable bug fixes +~~~~~~~~~~~~~~~~~ + +These are bug fixes that might have notable behavior changes. + +.. _whatsnew_230.notable_bug_fixes.notable_bug_fix1: + +notable_bug_fix1 +^^^^^^^^^^^^^^^^ + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.deprecations: + +Deprecations +~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.performance: + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_230.bug_fixes: + +Bug fixes +~~~~~~~~~ + +Categorical +^^^^^^^^^^^ +- +- + +Datetimelike +^^^^^^^^^^^^ +- +- + +Timedelta +^^^^^^^^^ +- +- + +Timezones +^^^^^^^^^ +- +- + +Numeric +^^^^^^^ +- +- + +Conversion +^^^^^^^^^^ +- +- + +Strings +^^^^^^^ +- +- + +Interval +^^^^^^^^ +- +- + +Indexing +^^^^^^^^ +- +- + +Missing +^^^^^^^ +- +- + +MultiIndex +^^^^^^^^^^ +- +- + +I/O +^^^ +- +- + +Period +^^^^^^ +- +- + +Plotting +^^^^^^^^ +- +- + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ +- +- + +Reshaping +^^^^^^^^^ +- +- + +Sparse +^^^^^^ +- +- + +ExtensionArray +^^^^^^^^^^^^^^ +- +- + +Styler +^^^^^^ +- +- + +Other +^^^^^ +- +- + +.. --------------------------------------------------------------------------- +.. 
_whatsnew_230.contributors: + +Contributors +~~~~~~~~~~~~ From a9e7d2b2f4448d7f01bf07175953831719283a32 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 27 Aug 2024 15:54:15 -0700 Subject: [PATCH 253/396] BUG (string): str.replace with negative n (#59628) * BUG (string): str.replace with negative n * update GH ref --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/arrays/string_arrow.py | 4 +--- pandas/tests/extension/test_arrow.py | 11 +++++++++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index d1881bf04826f..528226502da33 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -102,7 +102,7 @@ Conversion Strings ^^^^^^^ -- +- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) - Interval diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 91c1f20ba93c6..5c6cca41be027 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -352,9 +352,7 @@ def _str_replace( fallback_performancewarning() return super()._str_replace(pat, repl, n, case, flags, regex) - func = pc.replace_substring_regex if regex else pc.replace_substring - result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n) - return type(self)(result) + return ArrowExtensionArray._str_replace(self, pat, repl, n, case, flags, regex) def _str_repeat(self, repeats: int | Sequence[int]): if not isinstance(repeats, int): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0e8e1809d08ac..47d13b331843c 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1840,6 +1840,17 @@ def test_str_replace_negative_n(): expected = pd.Series(["bc", ""], dtype=ArrowDtype(pa.string())) tm.assert_series_equal(expected, actual) + # Same bug for pyarrow-backed StringArray GH#59628 + ser2 = ser.astype(pd.StringDtype(storage="pyarrow")) + actual2 = ser2.str.replace("a", "", -3, True) + expected2 = expected.astype(ser2.dtype) + tm.assert_series_equal(expected2, actual2) + + ser3 = ser.astype(pd.StringDtype(storage="pyarrow", na_value=np.nan)) + actual3 = ser3.str.replace("a", "", -3, True) + expected3 = expected.astype(ser3.dtype) + tm.assert_series_equal(expected3, actual3) + def test_str_repeat_unsupported(): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) From 3d1617f3d89ae76ff435ad39096359d8b6e9a149 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 28 Aug 2024 10:09:48 -0700 Subject: [PATCH 254/396] TST (string): fix xfailed groupby value_counts tests (#59632) --- .../groupby/methods/test_value_counts.py | 37 +++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 51232fac7d6f6..dc986d046ca41 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -8,9 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td from pandas import ( @@ -288,7 +285,6 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", 
strict=False) @pytest.mark.parametrize("groupby", ["column", "array", "function"]) @pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) @pytest.mark.parametrize( @@ -302,7 +298,16 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("frame", [True, False]) def test_against_frame_and_seriesgroupby( - education_df, groupby, normalize, name, sort, ascending, as_index, frame, request + education_df, + groupby, + normalize, + name, + sort, + ascending, + as_index, + frame, + request, + using_infer_string, ): # test all parameters: # - Use column, array or function as by= parameter @@ -366,17 +371,24 @@ def test_against_frame_and_seriesgroupby( index_frame["gender"] = index_frame["both"].str.split("-").str.get(0) index_frame["education"] = index_frame["both"].str.split("-").str.get(1) del index_frame["both"] - index_frame = index_frame.rename({0: None}, axis=1) - expected.index = MultiIndex.from_frame(index_frame) + index_frame2 = index_frame.rename({0: None}, axis=1) + expected.index = MultiIndex.from_frame(index_frame2) + + if index_frame2.columns.isna()[0]: + # with using_infer_string, the columns in index_frame as string + # dtype, which makes the rename({0: None}) above use np.nan + # instead of None, so we need to set None more explicitly. + expected.index.names = [None] + expected.index.names[1:] tm.assert_series_equal(result, expected) else: expected.insert(1, "gender", expected["both"].str.split("-").str.get(0)) expected.insert(2, "education", expected["both"].str.split("-").str.get(1)) + if using_infer_string: + expected = expected.astype({"gender": "str", "education": "str"}) del expected["both"] tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype", [ @@ -403,6 +415,7 @@ def test_compound( expected_count, expected_group_size, dtype, + using_infer_string, ): education_df = education_df.astype(dtype) education_df.columns = education_df.columns.astype(dtype) @@ -425,6 +438,11 @@ def test_compound( expected["count"] = expected_count if dtype == "string[pyarrow]": expected["count"] = expected["count"].convert_dtypes() + if using_infer_string and dtype == object: + expected = expected.astype( + {"country": "str", "gender": "str", "education": "str"} + ) + tm.assert_frame_equal(result, expected) @@ -537,9 +555,6 @@ def names_with_nulls_df(nulls_fixture): ) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize( "dropna, expected_data, expected_index", [ From 9cb66bfba2f928a8b30046c0d12faa7972c96896 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 28 Aug 2024 10:16:45 -0700 Subject: [PATCH 255/396] REF (string): rename result converter methods (#59626) --- pandas/core/arrays/_arrow_string_mixins.py | 8 +++++ pandas/core/arrays/arrow/array.py | 6 ++++ pandas/core/arrays/string_arrow.py | 38 +++++++++++----------- 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index cc41985843574..a99c370e9d927 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -17,6 +17,14 @@ class ArrowStringArrayMixin: def __init__(self, *args, **kwargs) -> None: raise NotImplementedError + def _convert_bool_result(self, result): + # Convert a bool-dtype 
result to the appropriate result type + raise NotImplementedError + + def _convert_int_result(self, result): + # Convert an integer-dtype result to the appropriate result type + raise NotImplementedError + def _str_pad( self, width: int, diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 5f8963c81b0ba..f976d0b3745e8 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2285,6 +2285,12 @@ def _apply_elementwise(self, func: Callable) -> list[list[Any]]: for chunk in self._pa_array.iterchunks() ] + def _convert_bool_result(self, result): + return type(self)(result) + + def _convert_int_result(self, result): + return type(self)(result) + def _str_count(self, pat: str, flags: int = 0): if flags: raise NotImplementedError(f"count not implemented with {flags=}") diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 5c6cca41be027..f524c8bc5d314 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -214,7 +214,7 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) - def _result_converter(self, values, na=None): + def _convert_bool_result(self, values, na=None): if self.dtype.na_value is np.nan: if not isna(na): values = values.fill_null(bool(na)) @@ -296,7 +296,7 @@ def _str_contains( result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) else: result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = self._result_converter(result, na=na) + result = self._convert_bool_result(result, na=na) if not isna(na): result[isna(result)] = bool(na) return result @@ -318,7 +318,7 @@ def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) if not isna(na): result = result.fill_null(na) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): if isinstance(pat, str): @@ -337,7 +337,7 @@ def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) if not isna(na): result = result.fill_null(na) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_replace( self, @@ -389,43 +389,43 @@ def _str_slice( def _str_isalnum(self): result = pc.utf8_is_alnum(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_isalpha(self): result = pc.utf8_is_alpha(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_isdigit(self): result = pc.utf8_is_digit(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_islower(self): result = pc.utf8_is_lower(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_isnumeric(self): result = pc.utf8_is_numeric(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_isspace(self): result = pc.utf8_is_space(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) 
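# [Editor's note -- illustrative sketch, not a line of the upstream hunk.]
# The rename separates "compute with pyarrow" (shared mixin code) from
# "convert to the right pandas result" (a per-array hook). Roughly, assuming
# ``result`` is a pyarrow boolean ChunkedArray:
#
#     def _convert_bool_result(self, result):
#         if self.dtype.na_value is np.nan:   # NaN-semantics string dtype
#             return ArrowExtensionArray(result).to_numpy(na_value=np.nan)
#         return BooleanDtype().__from_arrow__(result)  # masked BooleanArray
#
# with ``_convert_int_result`` handling integer-valued results analogously.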
def _str_istitle(self): result = pc.utf8_is_title(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_isupper(self): result = pc.utf8_is_upper(self._pa_array) - return self._result_converter(result) + return self._convert_bool_result(result) def _str_len(self): result = pc.utf8_length(self._pa_array) - return self._convert_int_dtype(result) + return self._convert_int_result(result) def _str_lower(self): return type(self)(pc.utf8_lower(self._pa_array)) @@ -472,7 +472,7 @@ def _str_count(self, pat: str, flags: int = 0): if flags: return super()._str_count(pat, flags) result = pc.count_substring_regex(self._pa_array, pat) - return self._convert_int_dtype(result) + return self._convert_int_result(result) def _str_find(self, sub: str, start: int = 0, end: int | None = None): if start != 0 and end is not None: @@ -486,7 +486,7 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): result = pc.find_substring(slices, sub) else: return super()._str_find(sub, start, end) - return self._convert_int_dtype(result) + return self._convert_int_result(result) def _str_get_dummies(self, sep: str = "|"): dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) @@ -495,7 +495,7 @@ def _str_get_dummies(self, sep: str = "|"): dummies = np.vstack(dummies_pa.to_numpy()) return dummies.astype(np.int64, copy=False), labels - def _convert_int_dtype(self, result): + def _convert_int_result(self, result): if self.dtype.na_value is np.nan: if isinstance(result, pa.Array): result = result.to_numpy(zero_copy_only=False) @@ -522,7 +522,7 @@ def _reduce( result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) if name in ("argmin", "argmax") and isinstance(result, pa.Array): - return self._convert_int_dtype(result) + return self._convert_int_result(result) elif isinstance(result, pa.Array): return type(self)(result) else: @@ -540,7 +540,7 @@ def _rank( """ See Series.rank.__doc__. 
""" - return self._convert_int_dtype( + return self._convert_int_result( self._rank_calc( axis=axis, method=method, From d64b8d80ce258628171b1300cc24357d0cdd6ff9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 28 Aug 2024 12:52:56 -0700 Subject: [PATCH 256/396] TST (string) fix xfailed groupby tests (3) (#59642) * TST (string) fix xfailed groupby tests (3) * TST: non-pyarrow build --- pandas/tests/groupby/methods/test_describe.py | 8 ++--- pandas/tests/groupby/methods/test_nth.py | 6 ++-- pandas/tests/groupby/test_groupby_dropna.py | 16 ---------- .../tests/groupby/transform/test_transform.py | 29 +++++++++++++------ 4 files changed, 24 insertions(+), 35 deletions(-) diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py index 34b046bff7c91..c80063e673b81 100644 --- a/pandas/tests/groupby/methods/test_describe.py +++ b/pandas/tests/groupby/methods/test_describe.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -73,7 +71,6 @@ def test_series_describe_as_index(as_index, keys): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_describe_multikey(tsframe): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() @@ -82,7 +79,7 @@ def test_frame_describe_multikey(tsframe): group = grouped[col].describe() # GH 17464 - Remove duplicate MultiIndex levels group_col = MultiIndex( - levels=[[col], group.columns], + levels=[Index([col], dtype=tsframe.columns.dtype), group.columns], codes=[[0] * len(group.columns), range(len(group.columns))], ) group = DataFrame(group.values, columns=group_col, index=group.index) @@ -275,7 +272,6 @@ def test_describe(self, df, gb, gni): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [int, float, object]) @pytest.mark.parametrize( "kwargs", @@ -297,5 +293,5 @@ def test_groupby_empty_dataset(dtype, kwargs): result = df.iloc[:0].groupby("A").B.describe(**kwargs) expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0] - expected.index = Index([]) + expected.index = Index([], dtype=df.columns.dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_nth.py b/pandas/tests/groupby/methods/test_nth.py index 344258257ba80..2722993ee5cdf 100644 --- a/pandas/tests/groupby/methods/test_nth.py +++ b/pandas/tests/groupby/methods/test_nth.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -706,14 +704,14 @@ def test_first_multi_key_groupby_categorical(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("method", ["first", "last", "nth"]) def test_groupby_last_first_nth_with_none(method, nulls_fixture): # GH29645 - expected = Series(["y"]) + expected = Series(["y"], dtype=object) data = Series( [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture], index=[0, 0, 0, 0, 0], + dtype=object, ).groupby(level=0) if method == "nth": diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 3856a0d8928a7..9c01e017dd29c 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ 
b/pandas/tests/groupby/test_groupby_dropna.py @@ -3,7 +3,6 @@ from pandas._config import using_string_dtype -from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.missing import na_value_for_dtype @@ -13,9 +12,6 @@ from pandas.tests.groupby import get_groupby_method_args -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -59,9 +55,6 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_one_group( tm.assert_frame_equal(grouped, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -138,9 +131,6 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): tm.assert_frame_equal(grouped, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize( "dropna, idx, expected", [ @@ -216,9 +206,6 @@ def test_groupby_dataframe_slice_then_transform(dropna, index): tm.assert_series_equal(result, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -300,9 +287,6 @@ def test_groupby_dropna_datetime_like_data( tm.assert_frame_equal(grouped, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize( "dropna, data, selected_data, levels", [ diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index a5433d5496b0b..5823656a610e5 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -5,6 +5,7 @@ from pandas._config import using_string_dtype from pandas._libs import lib +from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import ensure_platform_int @@ -499,8 +500,7 @@ def test_transform_select_columns(df): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_transform_nuisance_raises(df): +def test_transform_nuisance_raises(df, using_infer_string): # case that goes through _transform_item_by_item df.columns = ["A", "B", "B", "D"] @@ -510,10 +510,16 @@ def test_transform_nuisance_raises(df): grouped = df.groupby("A") gbc = grouped["B"] - with pytest.raises(TypeError, match="Could not convert"): + msg = "Could not convert" + if using_infer_string: + if df.columns.dtype.storage == "pyarrow": + msg = "with dtype str does not support operation 'mean'" + else: + msg = "Cannot perform reduction 'mean' with string dtype" + with pytest.raises(TypeError, match=msg): gbc.transform(lambda x: np.mean(x)) - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match=msg): df.groupby("A").transform(lambda x: np.mean(x)) @@ -582,8 +588,7 @@ def test_transform_coercion(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_groupby_transform_with_int(): +def test_groupby_transform_with_int(using_infer_string): # GH 3740, make sure that we might upcast on item-by-item transform # floats @@ -613,8 +618,14 @@ def test_groupby_transform_with_int(): "D": "foo", } ) + msg = "Could not convert" + if using_infer_string: + 
if HAS_PYARROW: + msg = "with dtype str does not support operation 'mean'" + else: + msg = "Cannot perform reduction 'mean' with string dtype" with np.errstate(all="ignore"): - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match=msg): df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) result = df.groupby("A")[["B", "C"]].transform( lambda x: (x - x.mean()) / x.std() @@ -626,7 +637,7 @@ def test_groupby_transform_with_int(): s = Series([2, 3, 4, 10, 5, -1]) df = DataFrame({"A": [1, 1, 1, 2, 2, 2], "B": 1, "C": s, "D": "foo"}) with np.errstate(all="ignore"): - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match=msg): df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) result = df.groupby("A")[["B", "C"]].transform( lambda x: (x - x.mean()) / x.std() @@ -850,7 +861,6 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.slow @pytest.mark.parametrize( "op, args, targop", @@ -901,6 +911,7 @@ def test_cython_transform_frame_column( "does not support .* operations", ".* is not supported for object dtype", "is not implemented for this dtype", + ".* is not supported for str dtype", ] ) with pytest.raises(TypeError, match=msg): From 807d8d54788f91d2f1d9ad345f0ea3b8595b538c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 29 Aug 2024 05:27:10 -0700 Subject: [PATCH 257/396] REF (string): de-duplicate str_endswith, startswith (#59568) --- pandas/core/arrays/_arrow_string_mixins.py | 48 +++++++++++++++++++++- pandas/core/arrays/arrow/array.py | 33 +-------------- pandas/core/arrays/string_arrow.py | 40 +----------------- 3 files changed, 49 insertions(+), 72 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index a99c370e9d927..9b84ddb7cfe55 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -1,18 +1,28 @@ from __future__ import annotations -from typing import Literal +from typing import ( + TYPE_CHECKING, + Literal, +) import numpy as np from pandas.compat import pa_version_under10p1 +from pandas.core.dtypes.missing import isna + if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc +if TYPE_CHECKING: + from collections.abc import Sized + + from pandas._typing import Scalar + class ArrowStringArrayMixin: - _pa_array = None + _pa_array: Sized def __init__(self, *args, **kwargs) -> None: raise NotImplementedError @@ -90,3 +100,37 @@ def _str_removesuffix(self, suffix: str): removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) result = pc.if_else(ends_with, removed, self._pa_array) return type(self)(result) + + def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + if isinstance(pat, str): + result = pc.starts_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple we return null for missing values and False + # for valid values. 
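# [Editor's note -- illustration, not a line of the upstream hunk.] This
# mirrors Python's own semantics, where "abc".startswith(()) is False for
# every valid string, so only missing entries come back as null:
#
#     pd.Series(["abc", None], dtype="string").str.startswith(())
#     # -> [False, <NA>]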
+ result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.starts_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) + if not isna(na): # pyright: ignore [reportGeneralTypeIssues] + result = result.fill_null(na) + return self._convert_bool_result(result) + + def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + if isinstance(pat, str): + result = pc.ends_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple we return null for missing values and False + # for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.ends_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) + if not isna(na): # pyright: ignore [reportGeneralTypeIssues] + result = result.fill_null(na) + return self._convert_bool_result(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index f976d0b3745e8..220ce96c22a13 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2311,38 +2311,7 @@ def _str_contains( result = result.fill_null(na) return type(self)(result) - def _str_startswith(self, pat: str | tuple[str, ...], na=None): - if isinstance(pat, str): - result = pc.starts_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. - result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return type(self)(result) - - def _str_endswith(self, pat: str | tuple[str, ...], na=None): - if isinstance(pat, str): - result = pc.ends_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. 
- result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) + def _result_converter(self, result): return type(self)(result) def _str_replace( diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f524c8bc5d314..a8590d3c9b526 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -284,6 +284,8 @@ def _data(self): # String methods interface _str_map = BaseStringArray._str_map + _str_startswith = ArrowStringArrayMixin._str_startswith + _str_endswith = ArrowStringArrayMixin._str_endswith def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -301,44 +303,6 @@ def _str_contains( result[isna(result)] = bool(na) return result - def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): - if isinstance(pat, str): - result = pc.starts_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) - ) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return self._convert_bool_result(result) - - def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): - if isinstance(pat, str): - result = pc.ends_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) - ) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return self._convert_bool_result(result) - def _str_replace( self, pat: str | re.Pattern, From 4de42681f8351ff3dd906417c515edd1d5f3cf7f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 31 Aug 2024 09:46:27 -0700 Subject: [PATCH 258/396] DEPR (string): non-bool na for obj.str.contains (#59615) Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/arrays/string_arrow.py | 8 ++++ pandas/core/strings/object_array.py | 26 +++++++++++ pandas/tests/strings/test_find_replace.py | 55 +++++++++++++++++++++-- 4 files changed, 87 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 528226502da33..8a64aa7c609d6 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -53,7 +53,7 @@ notable_bug_fix1 Deprecations ~~~~~~~~~~~~ -- +- Deprecated allowing non-``bool`` values for ``na`` in :meth:`.str.contains`, :meth:`.str.startswith`, and :meth:`.str.endswith` for dtypes that do not already disallow these (:issue:`59615`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index a8590d3c9b526..6ae6e75bbf00d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -300,6 +300,14 @@ def _str_contains( result = pc.match_substring(self._pa_array, pat, ignore_case=not case) result = self._convert_bool_result(result, na=na) if not isna(na): + if not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.contains is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) result[isna(result)] = bool(na) return result diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 090e27ec58cc3..f376c239a0ce0 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -10,12 +10,14 @@ cast, ) import unicodedata +import warnings import numpy as np from pandas._libs import lib import pandas._libs.missing as libmissing import pandas._libs.ops as libops +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.missing import isna @@ -140,14 +142,38 @@ def _str_contains( else: upper_pat = pat.upper() f = lambda x: upper_pat in x.upper() + if not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.contains is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na, dtype=np.dtype("bool")) def _str_startswith(self, pat, na=None): f = lambda x: x.startswith(pat) + if not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.startswith is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_endswith(self, pat, na=None): f = lambda x: x.endswith(pat) + if not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + "Allowing a non-bool 'na' in obj.str.endswith is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_replace( diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 78ce1d7418886..8c5a9b39157ea 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -4,6 +4,9 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat import HAS_PYARROW from pandas.errors import PerformanceWarning import pandas.util._test_decorators as td @@ -167,7 +170,16 @@ def test_contains_na_kwarg_for_nullable_string_dtype( # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416 values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype) - result = values.str.contains("a", na=na, regex=regex) + + msg = ( + "Allowing a non-bool 'na' in obj.str.contains is deprecated and " + "will raise in a future version" + ) + warn = None + if not pd.isna(na) and not isinstance(na, bool): + warn = FutureWarning + with tm.assert_produces_warning(warn, match=msg): + result = values.str.contains("a", na=na, regex=regex) expected = Series([True, False, False, True, expected], dtype="boolean") 
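# [Editor's note -- illustration, not a line of the upstream hunk.] From
# user code, the deprecation under test looks like this (a sketch assuming
# the python-backed nullable string dtype):
#
#     ser = pd.Series(["a", None], dtype="string[python]")
#     ser.str.contains("a", na="foo")   # FutureWarning: 'na' must be a bool
#     ser.str.contains("a", na=False)   # unaffected, no warning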
tm.assert_series_equal(result, expected) @@ -233,6 +245,7 @@ def test_contains_nan(any_string_dtype): expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) + # TODO(infer_string) # this particular combination of events is broken on 2.3 # would require cherry picking #58483, which in turn requires #57481 # which introduce many behavioral changes @@ -241,14 +254,19 @@ def test_contains_nan(any_string_dtype): and any_string_dtype.storage == "python" and any_string_dtype.na_value is np.nan ): - result = s.str.contains("foo", na="foo") + msg = ( + "Allowing a non-bool 'na' in obj.str.contains is deprecated and " + "will raise in a future version" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) elif any_string_dtype.na_value is np.nan: expected = Series([True, True, True], dtype=np.bool_) else: expected = Series([True, True, True], dtype="boolean") - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.str.contains("foo") expected_dtype = ( @@ -263,6 +281,37 @@ def test_contains_nan(any_string_dtype): # -------------------------------------------------------------------------------------- +@pytest.mark.xfail( + using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False +) +def test_startswith_endswith_validate_na(any_string_dtype): + # GH#59615 + ser = Series( + ["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"], + dtype=any_string_dtype, + ) + + dtype = ser.dtype + if ( + isinstance(dtype, pd.StringDtype) and dtype.storage == "python" + ) or dtype == np.dtype("object"): + msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.startswith("kapow", na="baz") + msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.endswith("bar", na="baz") + else: + # TODO(infer_string): don't surface pyarrow errors + import pyarrow as pa + + msg = "Could not convert 'baz' with type str: tried to convert to boolean" + with pytest.raises(pa.lib.ArrowInvalid, match=msg): + ser.str.startswith("kapow", na="baz") + with pytest.raises(pa.lib.ArrowInvalid, match=msg): + ser.str.endswith("kapow", na="baz") + + @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) @pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) From 205e637e4881fc2c59d1f100920c3184324bcd34 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Sep 2024 20:00:59 +0200 Subject: [PATCH 259/396] TST (string dtype): fix and clean up arrow roundtrip tests (#59678) * TST (string dtype): fix and clean up arrow roundtrip tests * fix using_infer_string --- pandas/tests/arrays/masked/test_arrow_compat.py | 11 +++-------- pandas/tests/arrays/string_/test_string.py | 14 ++++++++++---- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 31765165f5f16..293ee4095d02e 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -1,17 +1,12 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm -pytestmark = 
[ - pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" - ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 1296cc3b5a494..4c53dabcdbf7a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -524,7 +524,6 @@ def test_arrow_array(dtype): assert arr.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_roundtrip(dtype, string_storage, using_infer_string): # roundtrip possible from arrow 1.0.0 @@ -543,13 +542,16 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string): assert result["a"].dtype == "object" else: assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage}]") + expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value)) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is result["a"].dtype.na_value -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 @@ -571,7 +573,11 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): assert result["a"].dtype == "object" else: assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage}]") + expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value)) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) tm.assert_frame_equal(result, expected) From bb7e65cd318b658d96780d835c6c4d94b0e8b38c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Sep 2024 11:47:39 -0700 Subject: [PATCH 260/396] API (string): str.center with pyarrow-backed string dtype (#59624) --- doc/source/whatsnew/v2.3.0.rst | 3 ++- pandas/core/arrays/_arrow_string_mixins.py | 20 ++++++++++++++++++-- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/strings/test_case_justify.py | 6 +----- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 8a64aa7c609d6..03355f655eb28 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -103,7 +103,8 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) -- +- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) + Interval ^^^^^^^^ diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 9b84ddb7cfe55..e8051c803676c 100644 --- 
a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -1,5 +1,6 @@ from __future__ import annotations +from functools import partial from typing import ( TYPE_CHECKING, Literal, @@ -7,7 +8,10 @@ import numpy as np -from pandas.compat import pa_version_under10p1 +from pandas.compat import ( + pa_version_under10p1, + pa_version_under17p0, +) from pandas.core.dtypes.missing import isna @@ -46,7 +50,19 @@ def _str_pad( elif side == "right": pa_pad = pc.utf8_rpad elif side == "both": - pa_pad = pc.utf8_center + if pa_version_under17p0: + # GH#59624 fall back to object dtype + from pandas import array + + obj_arr = self.astype(object, copy=False) # type: ignore[attr-defined] + obj = array(obj_arr, dtype=object) + result = obj._str_pad(width, side, fillchar) # type: ignore[attr-defined] + return type(self)._from_sequence(result, dtype=self.dtype) # type: ignore[attr-defined] + else: + # GH#54792 + # https://github.com/apache/arrow/issues/15053#issuecomment-2317032347 + lean_left = (width % 2) == 0 + pa_pad = partial(pc.utf8_center, lean_left_on_odd_padding=lean_left) else: raise ValueError( f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'" diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6ae6e75bbf00d..e4fcf6775e8f4 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -286,6 +286,7 @@ def _data(self): _str_map = BaseStringArray._str_map _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith + _str_pad = ArrowStringArrayMixin._str_pad def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -546,7 +547,6 @@ class ArrowStringArrayNumpySemantics(ArrowStringArray): _str_get = ArrowStringArrayMixin._str_get _str_removesuffix = ArrowStringArrayMixin._str_removesuffix _str_capitalize = ArrowStringArrayMixin._str_capitalize - _str_pad = ArrowStringArrayMixin._str_pad _str_title = ArrowStringArrayMixin._str_title _str_swapcase = ArrowStringArrayMixin._str_swapcase _str_slice_replace = ArrowStringArrayMixin._str_slice_replace diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index 41aedae90ca76..819556f961fa3 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -291,11 +291,7 @@ def test_center_ljust_rjust_mixed_object(): def test_center_ljust_rjust_fillchar(any_string_dtype): - if any_string_dtype == "string[pyarrow_numpy]": - pytest.skip( - "Arrow logic is different, " - "see https://github.com/pandas-dev/pandas/pull/54533/files#r1299808126", - ) + # GH#54533, GH#54792 s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype) result = s.str.center(5, fillchar="X") From c083ed0261d9cf73f27b9be01bc29d00e7df60f2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Sep 2024 09:27:00 -0700 Subject: [PATCH 261/396] REF (string): de-duplicate str_isfoo methods (#59705) --- pandas/core/arrays/_arrow_string_mixins.py | 40 ++++++++++++++++++- pandas/core/arrays/arrow/array.py | 27 ------------- pandas/core/arrays/string_arrow.py | 46 +++++----------------- 3 files changed, 48 insertions(+), 65 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index e8051c803676c..7f3e6eb67249e 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ 
b/pandas/core/arrays/_arrow_string_mixins.py @@ -52,10 +52,10 @@ def _str_pad( elif side == "both": if pa_version_under17p0: # GH#59624 fall back to object dtype - from pandas import array + from pandas import array as pd_array obj_arr = self.astype(object, copy=False) # type: ignore[attr-defined] - obj = array(obj_arr, dtype=object) + obj = pd_array(obj_arr, dtype=object) result = obj._str_pad(width, side, fillchar) # type: ignore[attr-defined] return type(self)._from_sequence(result, dtype=self.dtype) # type: ignore[attr-defined] else: @@ -150,3 +150,39 @@ def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): if not isna(na): # pyright: ignore [reportGeneralTypeIssues] result = result.fill_null(na) return self._convert_bool_result(result) + + def _str_isalnum(self): + result = pc.utf8_is_alnum(self._pa_array) + return self._convert_bool_result(result) + + def _str_isalpha(self): + result = pc.utf8_is_alpha(self._pa_array) + return self._convert_bool_result(result) + + def _str_isdecimal(self): + result = pc.utf8_is_decimal(self._pa_array) + return self._convert_bool_result(result) + + def _str_isdigit(self): + result = pc.utf8_is_digit(self._pa_array) + return self._convert_bool_result(result) + + def _str_islower(self): + result = pc.utf8_is_lower(self._pa_array) + return self._convert_bool_result(result) + + def _str_isnumeric(self): + result = pc.utf8_is_numeric(self._pa_array) + return self._convert_bool_result(result) + + def _str_isspace(self): + result = pc.utf8_is_space(self._pa_array) + return self._convert_bool_result(result) + + def _str_istitle(self): + result = pc.utf8_is_title(self._pa_array) + return self._convert_bool_result(result) + + def _str_isupper(self): + result = pc.utf8_is_upper(self._pa_array) + return self._convert_bool_result(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 220ce96c22a13..4cd8f7f9505d6 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2411,33 +2411,6 @@ def _str_slice( pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) ) - def _str_isalnum(self): - return type(self)(pc.utf8_is_alnum(self._pa_array)) - - def _str_isalpha(self): - return type(self)(pc.utf8_is_alpha(self._pa_array)) - - def _str_isdecimal(self): - return type(self)(pc.utf8_is_decimal(self._pa_array)) - - def _str_isdigit(self): - return type(self)(pc.utf8_is_digit(self._pa_array)) - - def _str_islower(self): - return type(self)(pc.utf8_is_lower(self._pa_array)) - - def _str_isnumeric(self): - return type(self)(pc.utf8_is_numeric(self._pa_array)) - - def _str_isspace(self): - return type(self)(pc.utf8_is_space(self._pa_array)) - - def _str_istitle(self): - return type(self)(pc.utf8_is_title(self._pa_array)) - - def _str_isupper(self): - return type(self)(pc.utf8_is_upper(self._pa_array)) - def _str_len(self): return type(self)(pc.utf8_length(self._pa_array)) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e4fcf6775e8f4..a806ee86999c2 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -283,6 +283,16 @@ def _data(self): # ------------------------------------------------------------------------ # String methods interface + _str_isalnum = ArrowStringArrayMixin._str_isalnum + _str_isalpha = ArrowStringArrayMixin._str_isalpha + _str_isdecimal = ArrowStringArrayMixin._str_isdecimal + _str_isdigit = ArrowStringArrayMixin._str_isdigit + _str_islower = 
ArrowStringArrayMixin._str_islower + _str_isnumeric = ArrowStringArrayMixin._str_isnumeric + _str_isspace = ArrowStringArrayMixin._str_isspace + _str_istitle = ArrowStringArrayMixin._str_istitle + _str_isupper = ArrowStringArrayMixin._str_isupper + _str_map = BaseStringArray._str_map _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith @@ -360,42 +370,6 @@ def _str_slice( pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) ) - def _str_isalnum(self): - result = pc.utf8_is_alnum(self._pa_array) - return self._convert_bool_result(result) - - def _str_isalpha(self): - result = pc.utf8_is_alpha(self._pa_array) - return self._convert_bool_result(result) - - def _str_isdecimal(self): - result = pc.utf8_is_decimal(self._pa_array) - return self._convert_bool_result(result) - - def _str_isdigit(self): - result = pc.utf8_is_digit(self._pa_array) - return self._convert_bool_result(result) - - def _str_islower(self): - result = pc.utf8_is_lower(self._pa_array) - return self._convert_bool_result(result) - - def _str_isnumeric(self): - result = pc.utf8_is_numeric(self._pa_array) - return self._convert_bool_result(result) - - def _str_isspace(self): - result = pc.utf8_is_space(self._pa_array) - return self._convert_bool_result(result) - - def _str_istitle(self): - result = pc.utf8_is_title(self._pa_array) - return self._convert_bool_result(result) - - def _str_isupper(self): - result = pc.utf8_is_upper(self._pa_array) - return self._convert_bool_result(result) - def _str_len(self): result = pc.utf8_length(self._pa_array) return self._convert_int_result(result) From 77a2326d924fbf97f38e73b115055aa2003ef64c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Sep 2024 09:38:26 -0700 Subject: [PATCH 262/396] TST (string): copy/view tests (#59702) --- pandas/core/dtypes/dtypes.py | 2 +- pandas/tests/copy_view/test_constructors.py | 5 +--- pandas/tests/copy_view/test_functions.py | 32 ++++++++++----------- pandas/tests/copy_view/test_internals.py | 10 +++---- pandas/tests/dtypes/test_dtypes.py | 3 -- 5 files changed, 23 insertions(+), 29 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 1c43ef55c11d7..c6ca24d19b906 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -453,7 +453,7 @@ def __eq__(self, other: object) -> bool: # Because left and right have the same length and are unique, # `indexer` not having any -1s implies that there is a # bijection between `left` and `right`. - return (indexer != -1).all() + return bool((indexer != -1).all()) # With object-dtype we need a comparison that identifies # e.g. 
int(2) as distinct from float(2) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 866b1964a334f..66c9b456f18ad 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -285,10 +283,9 @@ def test_dataframe_from_dict_of_series_with_reindex(dtype): assert np.shares_memory(arr_before, arr_after) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("cons", [Series, Index]) @pytest.mark.parametrize( - "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)] + "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], object)] ) def test_dataframe_from_series_or_index( using_copy_on_write, warn_copy_on_write, data, dtype, cons diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index a87baaedb9244..23ed7f9edcd22 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -16,10 +16,9 @@ from pandas.tests.copy_view.util import get_array -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_concat_frames(using_copy_on_write): - df = DataFrame({"b": ["a"] * 3}) - df2 = DataFrame({"a": ["a"] * 3}) + df = DataFrame({"b": ["a"] * 3}, dtype=object) + df2 = DataFrame({"a": ["a"] * 3}, dtype=object) df_orig = df.copy() result = concat([df, df2], axis=1) @@ -41,10 +40,9 @@ def test_concat_frames(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_concat_frames_updating_input(using_copy_on_write): - df = DataFrame({"b": ["a"] * 3}) - df2 = DataFrame({"a": ["a"] * 3}) + df = DataFrame({"b": ["a"] * 3}, dtype=object) + df2 = DataFrame({"a": ["a"] * 3}, dtype=object) result = concat([df, df2], axis=1) if using_copy_on_write: @@ -203,7 +201,7 @@ def test_concat_copy_keyword(using_copy_on_write, copy): assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") +# @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize( "func", [ @@ -212,8 +210,8 @@ def test_concat_copy_keyword(using_copy_on_write, copy): ], ) def test_merge_on_key(using_copy_on_write, func): - df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]}) - df2 = DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]}) + df1 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]}) + df2 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "b": [4, 5, 6]}) df1_orig = df1.copy() df2_orig = df2.copy() @@ -267,7 +265,6 @@ def test_merge_on_index(using_copy_on_write): tm.assert_frame_equal(df2, df2_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "func, how", [ @@ -276,8 +273,8 @@ def test_merge_on_index(using_copy_on_write): ], ) def test_merge_on_key_enlarging_one(using_copy_on_write, func, how): - df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]}) - df2 = DataFrame({"key": ["a", "b"], "b": [4, 5]}) + df1 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]}) + df2 = DataFrame({"key": Series(["a", "b"], dtype=object), "b": [4, 
5]}) df1_orig = df1.copy() df2_orig = df2.copy() @@ -321,9 +318,13 @@ def test_merge_copy_keyword(using_copy_on_write, copy): assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") +@pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, + reason="TODO(infer_string); result.index infers str dtype while both " + "df1 and df2 index are object.", +) def test_join_on_key(using_copy_on_write): - df_index = Index(["a", "b", "c"], name="key") + df_index = Index(["a", "b", "c"], name="key", dtype=object) df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True)) df2 = DataFrame({"b": [4, 5, 6]}, index=df_index.copy(deep=True)) @@ -355,9 +356,8 @@ def test_join_on_key(using_copy_on_write): tm.assert_frame_equal(df2, df2_orig) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_join_multiple_dataframes_on_key(using_copy_on_write): - df_index = Index(["a", "b", "c"], name="key") + df_index = Index(["a", "b", "c"], name="key", dtype=object) df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True)) dfs_list = [ diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index 6f7198520d22e..8526d38588897 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -1,12 +1,13 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame +from pandas import ( + DataFrame, + Series, +) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -78,7 +79,6 @@ def test_switch_options(): @td.skip_array_manager_invalid_test -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [np.intp, np.int8]) @pytest.mark.parametrize( "locs, arr", @@ -105,7 +105,7 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype): "c": [7, 8, 9], "d": [10, 11, 12], "e": [13, 14, 15], - "f": ["a", "b", "c"], + "f": Series(["a", "b", "c"], dtype=object), }, ) arr = arr.astype(dtype) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index a4916ed1bbd8a..a5666e169fb4c 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -5,8 +5,6 @@ import pytest import pytz -from pandas._config import using_string_dtype - from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas.core.dtypes.base import _registry as registry @@ -961,7 +959,6 @@ def test_same_categories_different_order(self): c2 = CategoricalDtype(["b", "a"], ordered=True) assert c1 is not c2 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("ordered1", [True, False, None]) @pytest.mark.parametrize("ordered2", [True, False, None]) def test_categorical_equality(self, ordered1, ordered2): From 5b571c0f9a836d4e07938a3b2ddc87a0dbd1818e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Sep 2024 12:44:42 -0700 Subject: [PATCH 263/396] TST (string): more targeted xfails in test_string.py (#59703) * TST (string): more targeted xfails in test_string.py * Fix no-pyarrow test * Update pandas/tests/extension/test_string.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/tests/extension/test_string.py Co-authored-by: Matthew Roeschke 
<10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/extension/test_string.py | 36 +++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index e44881a6d78ff..7f04858318013 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -21,7 +21,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW import pandas as pd import pandas._testing as tm @@ -30,10 +30,6 @@ from pandas.core.arrays.string_ import StringDtype from pandas.tests.extension import base -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - def maybe_split_array(arr, chunked): if not chunked: @@ -220,6 +216,36 @@ def test_compare_scalar(self, data, comparison_op): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) + def test_combine_add(self, data_repeated, using_infer_string, request): + dtype = next(data_repeated(1)).dtype + if using_infer_string and ( + (dtype.na_value is pd.NA) and dtype.storage == "python" + ): + mark = pytest.mark.xfail( + reason="The pointwise operation result will be inferred to " + "string[nan, pyarrow], which does not match the input dtype" + ) + request.applymarker(mark) + super().test_combine_add(data_repeated) + + def test_arith_series_with_array( + self, data, all_arithmetic_operators, using_infer_string, request + ): + dtype = data.dtype + if ( + using_infer_string + and all_arithmetic_operators == "__radd__" + and ( + (dtype.na_value is pd.NA) or (dtype.storage == "python" and HAS_PYARROW) + ) + ): + mark = pytest.mark.xfail( + reason="The pointwise operation result will be inferred to " + "string[nan, pyarrow], which does not match the input dtype" + ) + request.applymarker(mark) + super().test_arith_series_with_array(data, all_arithmetic_operators) + class Test2DCompat(base.Dim2CompatTests): @pytest.fixture(autouse=True) From 40d81db505ee07c5576d95f6461d691893d8d946 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 5 Sep 2024 16:21:06 -0700 Subject: [PATCH 264/396] REF (string): de-duplicate _str_contains (#59709) * REF: de-duplicate _str_contains * pyright ignore --- pandas/core/arrays/_arrow_string_mixins.py | 15 +++++++++++++++ pandas/core/arrays/arrow/array.py | 15 --------------- pandas/core/arrays/string_arrow.py | 14 ++++---------- 3 files changed, 19 insertions(+), 25 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 7f3e6eb67249e..1a90e4e876faf 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -186,3 +186,18 @@ def _str_istitle(self): def _str_isupper(self): result = pc.utf8_is_upper(self._pa_array) return self._convert_bool_result(result) + + def _str_contains( + self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + ): + if flags: + raise NotImplementedError(f"contains not implemented with {flags=}") + + if regex: + pa_contains = pc.match_substring_regex + else: + pa_contains = pc.match_substring + result = pa_contains(self._pa_array, pat, ignore_case=not case) + if not isna(na): # pyright: ignore [reportGeneralTypeIssues] + result = result.fill_null(na) + return 
self._convert_bool_result(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4cd8f7f9505d6..d4aaef7eced83 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2296,21 +2296,6 @@ def _str_count(self, pat: str, flags: int = 0): raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True - ): - if flags: - raise NotImplementedError(f"contains not implemented with {flags=}") - - if regex: - pa_contains = pc.match_substring_regex - else: - pa_contains = pc.match_substring - result = pa_contains(self._pa_array, pat, ignore_case=not case) - if not isna(na): - result = result.fill_null(na) - return type(self)(result) - def _result_converter(self, result): return type(self)(result) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index a806ee86999c2..a3c2659beced0 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -214,10 +214,8 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) - def _convert_bool_result(self, values, na=None): + def _convert_bool_result(self, values): if self.dtype.na_value is np.nan: - if not isna(na): - values = values.fill_null(bool(na)) return ArrowExtensionArray(values).to_numpy(na_value=np.nan) return BooleanDtype().__from_arrow__(values) @@ -305,11 +303,6 @@ def _str_contains( fallback_performancewarning() return super()._str_contains(pat, case, flags, na, regex) - if regex: - result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) - else: - result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = self._convert_bool_result(result, na=na) if not isna(na): if not isinstance(na, bool): # GH#59561 @@ -319,8 +312,9 @@ def _str_contains( FutureWarning, stacklevel=find_stack_level(), ) - result[isna(result)] = bool(na) - return result + na = bool(na) + + return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) def _str_replace( self, From a065afb2ff4497bd184d91d19000b6c7849fd332 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 6 Sep 2024 08:06:15 -0700 Subject: [PATCH 265/396] BUG (string): ArrowStringArray.find corner cases (#59562) --- pandas/core/arrays/_arrow_string_mixins.py | 44 +++++++++++++++++- pandas/core/arrays/arrow/array.py | 17 ------- pandas/core/arrays/string_arrow.py | 18 +++----- pandas/tests/extension/test_arrow.py | 52 ++++++++++++++++++++-- 4 files changed, 99 insertions(+), 32 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 1a90e4e876faf..4829b175783ed 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -3,6 +3,7 @@ from functools import partial from typing import ( TYPE_CHECKING, + Any, Literal, ) @@ -10,6 +11,7 @@ from pandas.compat import ( pa_version_under10p1, + pa_version_under13p0, pa_version_under17p0, ) @@ -20,7 +22,10 @@ import pyarrow.compute as pc if TYPE_CHECKING: - from collections.abc import Sized + from collections.abc import ( + Callable, + Sized, + ) from pandas._typing import Scalar @@ -39,6 +44,9 @@ def _convert_int_result(self, result): # Convert an integer-dtype result to the appropriate result type raise NotImplementedError + def 
_apply_elementwise(self, func: Callable) -> list[list[Any]]: + raise NotImplementedError + def _str_pad( self, width: int, @@ -201,3 +209,37 @@ def _str_contains( if not isna(na): # pyright: ignore [reportGeneralTypeIssues] result = result.fill_null(na) return self._convert_bool_result(result) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if ( + pa_version_under13p0 + and not (start != 0 and end is not None) + and not (start == 0 and end is None) + ): + # GH#59562 + res_list = self._apply_elementwise(lambda val: val.find(sub, start, end)) + return self._convert_int_result(pa.chunked_array(res_list)) + + if (start == 0 or start is None) and end is None: + result = pc.find_substring(self._pa_array, sub) + else: + if sub == "": + # GH#56792 + res_list = self._apply_elementwise( + lambda val: val.find(sub, start, end) + ) + return self._convert_int_result(pa.chunked_array(res_list)) + if start is None: + start_offset = 0 + start = 0 + elif start < 0: + start_offset = pc.add(start, pc.utf8_length(self._pa_array)) + start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset) + else: + start_offset = start + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + found = pc.not_equal(result, pa.scalar(-1, type=result.type)) + offset_result = pc.add(result, start_offset) + result = pc.if_else(found, offset_result, -1) + return self._convert_int_result(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d4aaef7eced83..861ec0c42c885 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2348,23 +2348,6 @@ def _str_fullmatch( pat = f"{pat}$" return self._str_match(pat, case, flags, na) - def _str_find(self, sub: str, start: int = 0, end: int | None = None): - if start != 0 and end is not None: - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - not_found = pc.equal(result, -1) - start_offset = max(0, start) - offset_result = pc.add(result, start_offset) - result = pc.if_else(not_found, result, offset_result) - elif start == 0 and end is None: - slices = self._pa_array - result = pc.find_substring(slices, sub) - else: - raise NotImplementedError( - f"find not implemented with {sub=}, {start=}, {end=}" - ) - return type(self)(result) - def _str_join(self, sep: str): if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( self._pa_array.type diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index a3c2659beced0..563be79e98cbb 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -416,18 +416,14 @@ def _str_count(self, pat: str, flags: int = 0): return self._convert_int_result(result) def _str_find(self, sub: str, start: int = 0, end: int | None = None): - if start != 0 and end is not None: - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - not_found = pc.equal(result, -1) - offset_result = pc.add(result, end - start) - result = pc.if_else(not_found, result, offset_result) - elif start == 0 and end is None: - slices = self._pa_array - result = pc.find_substring(slices, sub) - else: + if ( + pa_version_under13p0 + and not (start != 0 and end is not None) + and not (start == 0 and end is None) + ): + # GH#59562 return super()._str_find(sub, start, end) - return self._convert_int_result(result) + return 
ArrowStringArrayMixin._str_find(self, sub, start, end) def _str_get_dummies(self, sep: str = "|"): dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 47d13b331843c..12f3eedb6b9f1 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1925,10 +1925,56 @@ def test_str_find_negative_start(): tm.assert_series_equal(result, expected) -def test_str_find_notimplemented(): +def test_str_find_no_end(): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) - with pytest.raises(NotImplementedError, match="find not implemented"): - ser.str.find("ab", start=1) + result = ser.str.find("ab", start=1) + expected = pd.Series([-1, None], dtype="int64[pyarrow]") + tm.assert_series_equal(result, expected) + + +def test_str_find_negative_start_negative_end(): + # GH 56791 + ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string())) + result = ser.str.find(sub="d", start=-6, end=-3) + expected = pd.Series([3, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) + + +def test_str_find_large_start(): + # GH 56791 + ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string())) + result = ser.str.find(sub="d", start=16) + expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) + + +@pytest.mark.skipif( + pa_version_under13p0, reason="https://github.com/apache/arrow/issues/36311" +) +@pytest.mark.parametrize("start", [-15, -3, 0, 1, 15, None]) +@pytest.mark.parametrize("end", [-15, -1, 0, 3, 15, None]) +@pytest.mark.parametrize("sub", ["", "az", "abce", "a", "caa"]) +def test_str_find_e2e(start, end, sub): + s = pd.Series( + ["abcaadef", "abc", "abcdeddefgj8292", "ab", "a", ""], + dtype=ArrowDtype(pa.string()), + ) + object_series = s.astype(pd.StringDtype(storage="python")) + result = s.str.find(sub, start, end) + expected = object_series.str.find(sub, start, end).astype(result.dtype) + tm.assert_series_equal(result, expected) + + arrow_str_series = s.astype(pd.StringDtype(storage="pyarrow")) + result2 = arrow_str_series.str.find(sub, start, end).astype(result.dtype) + tm.assert_series_equal(result2, expected) + + +def test_str_find_negative_start_negative_end_no_match(): + # GH 56791 + ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string())) + result = ser.str.find(sub="d", start=-3, end=-6) + expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( From 62b2478dccadc242dc854cf32b4d54cff10a11dd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 6 Sep 2024 19:37:42 +0200 Subject: [PATCH 266/396] String dtype: implement _get_common_dtype (#59682) * String dtype: implement _get_common_dtype * add specific tests * try fix typing * try fix typing * suppress typing error * support numpy 2.0 string * fix typo --- pandas/core/arrays/string_.py | 32 ++++++++- pandas/tests/arrays/categorical/test_api.py | 3 - pandas/tests/arrays/string_/test_concat.py | 73 +++++++++++++++++++++ 3 files changed, 103 insertions(+), 5 deletions(-) create mode 100644 pandas/tests/arrays/string_/test_concat.py diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c04ec13dbd81c..620d549204388 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -167,9 +167,9 @@ def __init__( # a consistent NaN value (and we can use 
`dtype.na_value is np.nan`) na_value = np.nan elif na_value is not libmissing.NA: - raise ValueError("'na_value' must be np.nan or pd.NA, got {na_value}") + raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}") - self.storage = storage + self.storage = cast(str, storage) self._na_value = na_value def __repr__(self) -> str: @@ -280,6 +280,34 @@ def construct_array_type( # type: ignore[override] else: return ArrowStringArrayNumpySemantics + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + storages = set() + na_values = set() + + for dtype in dtypes: + if isinstance(dtype, StringDtype): + storages.add(dtype.storage) + na_values.add(dtype.na_value) + elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "T"): + continue + else: + return None + + if len(storages) == 2: + # if both python and pyarrow storage -> priority to pyarrow + storage = "pyarrow" + else: + storage = next(iter(storages)) # type: ignore[assignment] + + na_value: libmissing.NAType | float + if len(na_values) == 2: + # if both NaN and NA -> priority to NA + na_value = libmissing.NA + else: + na_value = next(iter(na_values)) + + return StringDtype(storage=storage, na_value=na_value) + def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray ) -> BaseStringArray: diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 1d948b7495a43..a939ee5f6f53f 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import PY311 from pandas import ( @@ -158,7 +156,6 @@ def test_reorder_categories_raises(self, new_categories): with pytest.raises(ValueError, match=msg): cat.reorder_categories(new_categories) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) old = cat.copy() diff --git a/pandas/tests/arrays/string_/test_concat.py b/pandas/tests/arrays/string_/test_concat.py new file mode 100644 index 0000000000000..320d700b2b6c3 --- /dev/null +++ b/pandas/tests/arrays/string_/test_concat.py @@ -0,0 +1,73 @@ +import numpy as np +import pytest + +from pandas.compat import HAS_PYARROW + +from pandas.core.dtypes.cast import find_common_type + +import pandas as pd +import pandas._testing as tm +from pandas.util.version import Version + + +@pytest.mark.parametrize( + "to_concat_dtypes, result_dtype", + [ + # same types + ([("pyarrow", pd.NA), ("pyarrow", pd.NA)], ("pyarrow", pd.NA)), + ([("pyarrow", np.nan), ("pyarrow", np.nan)], ("pyarrow", np.nan)), + ([("python", pd.NA), ("python", pd.NA)], ("python", pd.NA)), + ([("python", np.nan), ("python", np.nan)], ("python", np.nan)), + # pyarrow preference + ([("pyarrow", pd.NA), ("python", pd.NA)], ("pyarrow", pd.NA)), + # NA preference + ([("python", pd.NA), ("python", np.nan)], ("python", pd.NA)), + ], +) +def test_concat_series(request, to_concat_dtypes, result_dtype): + if any(storage == "pyarrow" for storage, _ in to_concat_dtypes) and not HAS_PYARROW: + pytest.skip("Could not import 'pyarrow'") + + ser_list = [ + pd.Series(["a", "b", None], dtype=pd.StringDtype(storage, na_value)) + for storage, na_value in to_concat_dtypes + ] + + result = pd.concat(ser_list, ignore_index=True) + expected = pd.Series( + ["a", "b", None, "a", "b", None], dtype=pd.StringDtype(*result_dtype) + ) + tm.assert_series_equal(result, expected) + 
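For illustration, a minimal sketch of the resolution rules _get_common_dtype encodes, assuming a pandas build from this branch with pyarrow installed (the na_value keyword to StringDtype is only available on this branch): mixed storages resolve to pyarrow, and mixed pd.NA/np.nan semantics resolve to pd.NA.

    import numpy as np
    import pandas as pd

    a = pd.Series(["x", None], dtype=pd.StringDtype("python"))
    b = pd.Series(["y", None], dtype=pd.StringDtype("pyarrow"))
    c = pd.Series(["z", None], dtype=pd.StringDtype("python", na_value=np.nan))

    # mixed python/pyarrow storage -> priority to pyarrow
    assert pd.concat([a, b]).dtype == pd.StringDtype("pyarrow")
    # mixed pd.NA/np.nan semantics -> priority to pd.NA
    assert pd.concat([a, c]).dtype == pd.StringDtype("python", na_value=pd.NA)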
+ # order doesn't matter for result + result = pd.concat(ser_list[::1], ignore_index=True) + tm.assert_series_equal(result, expected) + + +def test_concat_with_object(string_dtype_arguments): + # _get_common_dtype cannot inspect values, so object dtype with strings still + # results in object dtype + result = pd.concat( + [ + pd.Series(["a", "b", None], dtype=pd.StringDtype(*string_dtype_arguments)), + pd.Series(["a", "b", None], dtype=object), + ] + ) + assert result.dtype == np.dtype("object") + + +def test_concat_with_numpy(string_dtype_arguments): + # common type with a numpy string dtype always preserves the pandas string dtype + dtype = pd.StringDtype(*string_dtype_arguments) + assert find_common_type([dtype, np.dtype("U")]) == dtype + assert find_common_type([np.dtype("U"), dtype]) == dtype + assert find_common_type([dtype, np.dtype("U10")]) == dtype + assert find_common_type([np.dtype("U10"), dtype]) == dtype + + # with any other numpy dtype -> object + assert find_common_type([dtype, np.dtype("S")]) == np.dtype("object") + assert find_common_type([dtype, np.dtype("int64")]) == np.dtype("object") + + if Version(np.__version__) >= Version("2"): + assert find_common_type([dtype, np.dtypes.StringDType()]) == dtype + assert find_common_type([np.dtypes.StringDType(), dtype]) == dtype From 2e006e776f7d1fdc901ad2b37fefa41f2e4edb76 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 9 Sep 2024 05:53:48 -0500 Subject: [PATCH 267/396] TST/BUG (string dtype): Fix and adjust indexes string tests (#59544) Co-authored-by: Joris Van den Bossche --- pandas/core/construction.py | 5 +++- pandas/core/indexes/base.py | 6 ++++- .../tests/indexes/base_class/test_setops.py | 6 ++--- pandas/tests/indexes/test_base.py | 11 ++------ pandas/tests/indexes/test_old_base.py | 26 ++++++++----------- 5 files changed, 24 insertions(+), 30 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 5bccca9cfbd47..584a1d417d198 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -609,7 +609,10 @@ def sanitize_array( dtype = StringDtype(na_value=np.nan) subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) - if subarr is data and copy: + if ( + subarr is data + or (subarr.dtype == "str" and subarr.dtype.storage == "python") # type: ignore[union-attr] + ) and copy: subarr = subarr.copy() else: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 825316585c03c..a28c98ecc5cee 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -506,7 +506,8 @@ def __new__( elif is_ea_or_datetimelike_dtype(dtype): # non-EA dtype indexes have special casting logic, so we punt here - pass + if isinstance(data, (set, frozenset)): + data = list(data) elif is_ea_or_datetimelike_dtype(data_dtype): pass @@ -6995,6 +6996,9 @@ def insert(self, loc: int, item) -> Index: # We cannot keep the same dtype, so cast to the (often object) # minimal shared dtype before doing the insert. 
dtype = self._find_common_type_compat(item) + if dtype == self.dtype: + # EA's might run into recursion errors if loc is invalid + raise return self.astype(dtype).insert(loc, item) if arr.dtype != object or not isinstance( diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 2176aa52b17f4..a897e5aca058a 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Index, @@ -233,7 +231,6 @@ def test_tuple_union_bug(self, method, expected, sort): expected = Index(expected) tm.assert_index_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("first_list", [["b", "a"], []]) @pytest.mark.parametrize("second_list", [["a", "b"], []]) @pytest.mark.parametrize( @@ -243,6 +240,7 @@ def test_tuple_union_bug(self, method, expected, sort): def test_union_name_preservation( self, first_list, second_list, first_name, second_name, expected_name, sort ): + expected_dtype = object if not first_list or not second_list else "str" first = Index(first_list, name=first_name) second = Index(second_list, name=second_name) union = first.union(second, sort=sort) @@ -253,7 +251,7 @@ def test_union_name_preservation( expected = Index(sorted(vals), name=expected_name) tm.assert_index_equal(union, expected) else: - expected = Index(vals, name=expected_name) + expected = Index(vals, name=expected_name, dtype=expected_dtype) tm.assert_index_equal(union.sort_values(), expected.sort_values()) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index cf75f95d17b0a..813446440eded 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -76,9 +76,6 @@ def test_constructor_casting(self, index): tm.assert_contains_all(arr, new_index) tm.assert_index_equal(index, new_index) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_constructor_copy(self, using_infer_string): index = Index(list("abc"), name="name") arr = np.array(index) @@ -346,11 +343,6 @@ def test_constructor_empty_special(self, empty, klass): def test_view_with_args(self, index): index.view("i8") - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) @pytest.mark.parametrize( "index", [ @@ -367,7 +359,8 @@ def test_view_with_args_object_array_raises(self, index): msg = "When changing to a larger dtype" with pytest.raises(ValueError, match=msg): index.view("i8") - elif index.dtype == "string": + elif index.dtype == "str" and not index.dtype.storage == "python": + # TODO(infer_string): Make the errors consistent with pytest.raises(NotImplementedError, match="i8"): index.view("i8") else: diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index c17d4f54c36c5..37aa01ea046ca 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -6,10 +6,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp -from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import ( is_integer_dtype, @@ -28,6 +25,7 @@ PeriodIndex, RangeIndex, Series, + StringDtype, TimedeltaIndex, isna, period_range, 
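As background for the Index.insert hunk above: an incompatible scalar has long been inserted by first casting to the common dtype, and the added raise only stops the invalid-loc case from recursing when no wider dtype exists. A quick sketch of the long-standing fallback, against stock pandas:

    import pandas as pd

    idx = pd.Index([1, 2, 3])
    # incompatible scalar -> cast to the common (object) dtype, then insert
    assert idx.insert(1, "x").tolist() == [1, "x", 2, 3]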
@@ -233,7 +231,6 @@ def test_logical_compat(self, simple_index): with pytest.raises(TypeError, match=msg): idx.any() - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_repr_roundtrip(self, simple_index): if isinstance(simple_index, IntervalIndex): pytest.skip(f"Not a valid repr for {type(simple_index).__name__}") @@ -250,11 +247,6 @@ def test_repr_max_seq_item_setting(self, simple_index): repr(idx) assert "..." not in str(idx) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_ensure_copied_data(self, index): # Check the "copy" argument of each Index.__new__ is honoured @@ -302,7 +294,9 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._mask, result._values._mask, check_same="same" ) - elif index.dtype == "string[python]": + elif ( + isinstance(index.dtype, StringDtype) and index.dtype.storage == "python" + ): assert np.shares_memory(index._values._ndarray, result._values._ndarray) tm.assert_numpy_array_equal( index._values._ndarray, result._values._ndarray, check_same="same" @@ -432,11 +426,7 @@ def test_insert_base(self, index): result = trimmed.insert(0, index[0]) assert index[0:4].equals(result) - @pytest.mark.skipif( - using_string_dtype(), - reason="completely different behavior, tested elsewher", - ) - def test_insert_out_of_bounds(self, index): + def test_insert_out_of_bounds(self, index, using_infer_string): # TypeError/IndexError matches what np.insert raises in these cases if len(index) > 0: @@ -448,6 +438,12 @@ def test_insert_out_of_bounds(self, index): msg = "index (0|0.5) is out of bounds for axis 0 with size 0" else: msg = "slice indices must be integers or None or have an __index__ method" + + if using_infer_string and ( + index.dtype == "string" or index.dtype == "category" # noqa: PLR1714 + ): + msg = "loc must be an integer between" + with pytest.raises(err, match=msg): index.insert(0.5, "foo") From 22b16d7aa22dec9ccd6a1dc4d77811bc81c2d3dc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 9 Sep 2024 06:40:22 -0500 Subject: [PATCH 268/396] TST (string dtype): Adjust indexing string tests (#59541) Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_.py | 4 ++ pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/arrays/string_/test_string.py | 5 +-- pandas/tests/indexing/test_iloc.py | 31 +++++++------- pandas/tests/indexing/test_indexing.py | 18 ++++----- pandas/tests/indexing/test_loc.py | 47 ++++++++++++++-------- 6 files changed, 59 insertions(+), 48 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 620d549204388..43c46a4308f9e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -713,6 +713,10 @@ def __setitem__(self, key, value) -> None: else: if not is_array_like(value): value = np.asarray(value, dtype=object) + else: + # cast categories and friends to arrays to see if values are + # compatible, compatibility with arrow backed strings + value = np.asarray(value) if len(value) and not lib.is_string_array(value, skipna=True): raise TypeError("Must provide strings.") diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 563be79e98cbb..5ed12e7352bd1 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -231,7 +231,7 @@ 
def _maybe_convert_setitem_value(self, value): value[isna(value)] = None for v in value: if not (v is None or isinstance(v, str)): - raise TypeError("Scalar must be NA or str") + raise TypeError("Must provide strings") return super()._maybe_convert_setitem_value(value) def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 4c53dabcdbf7a..d3a0897f88f61 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -101,10 +101,7 @@ def test_setitem_validates(cls, dtype): with pytest.raises(TypeError, match=msg): arr[0] = 10 - if dtype.storage == "python": - msg = "Must provide strings." - else: - msg = "Scalar must be NA or str" + msg = "Must provide strings" with pytest.raises(TypeError, match=msg): arr[:] = np.array([1, 2]) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 3fd9498e21a73..45f63bdf1ee32 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError import pandas.util._test_decorators as td @@ -1218,22 +1216,25 @@ def test_iloc_getitem_int_single_ea_block_view(self): arr[2] = arr[-1] assert ser[0] == arr[-1] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_iloc_setitem_multicolumn_to_datetime(self): + def test_iloc_setitem_multicolumn_to_datetime(self, using_infer_string): # GH#20511 df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]}) - df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) - expected = DataFrame( - { - "A": [ - Timestamp("2021-01-01 00:00:00"), - Timestamp("2022-01-01 00:00:00"), - ], - "B": ["2021", "2022"], - } - ) - tm.assert_frame_equal(df, expected, check_dtype=False) + if using_infer_string: + with pytest.raises(TypeError, match="Invalid value"): + df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) + else: + df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) + expected = DataFrame( + { + "A": [ + Timestamp("2021-01-01 00:00:00"), + Timestamp("2022-01-01 00:00:00"), + ], + "B": ["2021", "2022"], + } + ) + tm.assert_frame_equal(df, expected, check_dtype=False) class TestILocErrors: diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index e57598cfc2be1..0ff33ba88b16f 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError from pandas.core.dtypes.common import ( @@ -563,12 +561,12 @@ def test_string_slice_empty(self): with pytest.raises(KeyError, match="^0$"): df.loc["2011", 0] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_astype_assignment(self, using_infer_string): # GH4312 (iloc) df_orig = DataFrame( [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) + df_orig[list("ABCDG")] = df_orig[list("ABCDG")].astype(object) df = df_orig.copy() @@ -578,9 +576,9 @@ def test_astype_assignment(self, using_infer_string): expected = DataFrame( [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["A"] = expected["A"].astype(object) - expected["B"] = expected["B"].astype(object) + expected[list("CDG")] = 
expected[list("CDG")].astype(object) + expected["A"] = expected["A"].astype(object) + expected["B"] = expected["B"].astype(object) tm.assert_frame_equal(df, expected) # GH5702 (loc) @@ -589,18 +587,16 @@ def test_astype_assignment(self, using_infer_string): expected = DataFrame( [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["A"] = expected["A"].astype(object) + expected[list("ABCDG")] = expected[list("ABCDG")].astype(object) tm.assert_frame_equal(df, expected) df = df_orig.copy() + df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64) expected = DataFrame( [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["B"] = expected["B"].astype(object) - expected["C"] = expected["C"].astype(object) + expected[list("ABCDG")] = expected[list("ABCDG")].astype(object) tm.assert_frame_equal(df, expected) def test_astype_assignment_full_replacements(self): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index d61b2ea642439..ad72be02f81b1 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1,5 +1,6 @@ """ test label based indexing with loc """ from collections import namedtuple +import contextlib from datetime import ( date, datetime, @@ -648,8 +649,9 @@ def test_loc_setitem_consistency_empty(self): expected["x"] = expected["x"].astype(np.int64) tm.assert_frame_equal(df, expected) + # incompatible dtype warning @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_loc_setitem_consistency_slice_column_len(self): + def test_loc_setitem_consistency_slice_column_len(self, using_infer_string): # .loc[:,column] setting with slice == len of the column # GH10408 levels = [ @@ -673,13 +675,24 @@ def test_loc_setitem_consistency_slice_column_len(self): ] df = DataFrame(values, index=mi, columns=cols) - df.loc[:, ("Respondent", "StartDate")] = to_datetime( - df.loc[:, ("Respondent", "StartDate")] - ) - df.loc[:, ("Respondent", "EndDate")] = to_datetime( - df.loc[:, ("Respondent", "EndDate")] - ) - df = df.infer_objects(copy=False) + ctx = contextlib.nullcontext() + if using_infer_string: + ctx = pytest.raises(TypeError, match="Invalid value") + + with ctx: + df.loc[:, ("Respondent", "StartDate")] = to_datetime( + df.loc[:, ("Respondent", "StartDate")] + ) + with ctx: + df.loc[:, ("Respondent", "EndDate")] = to_datetime( + df.loc[:, ("Respondent", "EndDate")] + ) + + if using_infer_string: + # infer-objects won't infer stuff anymore + return + + df = df.infer_objects() # Adding a new key df.loc[:, ("Respondent", "Duration")] = ( @@ -1269,20 +1282,23 @@ def test_loc_reverse_assignment(self): tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string") - def test_loc_setitem_str_to_small_float_conversion_type(self): + def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string): # GH#20388 col_data = [str(np.random.default_rng(2).random() * 1e-12) for _ in range(5)] result = DataFrame(col_data, columns=["A"]) - expected = DataFrame(col_data, columns=["A"], dtype=object) + expected = DataFrame(col_data, columns=["A"]) tm.assert_frame_equal(result, expected) # assigning with loc/iloc attempts to set the values inplace, which # in this case is successful - result.loc[result.index, "A"] = [float(x) for x in col_data] - expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object) - tm.assert_frame_equal(result, 
expected) + if using_infer_string: + with pytest.raises(TypeError, match="Must provide strings"): + result.loc[result.index, "A"] = [float(x) for x in col_data] + else: + result.loc[result.index, "A"] = [float(x) for x in col_data] + expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object) + tm.assert_frame_equal(result, expected) # assigning the entire column using __setitem__ swaps in the new array # GH#??? @@ -1458,9 +1474,6 @@ def test_loc_setitem_categorical_values_partial_column_slice(self): df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_loc_setitem_single_row_categorical(self, using_infer_string): # GH#25495 df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) From 1fd7e25a33dccd45a4ef915b884d6ca556ab4187 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 19:15:02 +0200 Subject: [PATCH 269/396] TST (string dtype): adjust pandas/tests/reshape tests (#59762) --- pandas/tests/reshape/concat/test_concat.py | 2 ++ pandas/tests/reshape/merge/test_merge_asof.py | 10 ++----- pandas/tests/reshape/test_get_dummies.py | 10 ++----- pandas/tests/reshape/test_melt.py | 25 ++++++----------- pandas/tests/reshape/test_pivot.py | 28 ++++++++++++------- 5 files changed, 34 insertions(+), 41 deletions(-) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 2a52d3060e4b9..77c45cf36894b 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -46,6 +46,7 @@ def test_append_concat(self): assert isinstance(result.index, PeriodIndex) assert result.index[0] == s1.index[0] + # test is not written to work with string dtype (checks .base) @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_copy(self, using_array_manager, using_copy_on_write): df = DataFrame(np.random.default_rng(2).standard_normal((4, 3))) @@ -80,6 +81,7 @@ def test_concat_copy(self, using_array_manager, using_copy_on_write): assert arr is df3._mgr.arrays[0] else: assert arr.base is not None + assert arr.base is not None # Float block was consolidated. 
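The string setitem contract exercised by the loc/iloc tests in the previous patch (#59541), sketched against the nullable string dtype; after that change the pyarrow backend raises the same "Must provide strings" message as the python backend:

    import numpy as np
    import pandas as pd

    arr = pd.array(["x", "y"], dtype="string")
    try:
        arr[:] = np.array([1.0, 2.0])  # non-string values are rejected
    except TypeError as err:
        assert "Must provide strings" in str(err)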
df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1))) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 11e29f4e10dc4..77a3d64415ace 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -4,8 +4,6 @@ import pytest import pytz -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -3083,12 +3081,8 @@ def test_on_float_by_int(self): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_merge_datatype_error_raises(self, using_infer_string): - if using_infer_string: - msg = "incompatible merge keys" - else: - msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype" + def test_merge_datatype_error_raises(self): + msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype" left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]}) right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]}) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 2c17b7f6a5a47..324d2a6cfd419 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype @@ -216,11 +214,10 @@ def test_dataframe_dummies_all_obj(self, df, sparse): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_dataframe_dummies_string_dtype(self, df, using_infer_string): + def test_dataframe_dummies_string_dtype(self, df, any_string_dtype): # GH44965 df = df[["A", "B"]] - df = df.astype({"A": "object", "B": "string"}) + df = df.astype({"A": "str", "B": any_string_dtype}) result = get_dummies(df) expected = DataFrame( { @@ -231,8 +228,7 @@ def test_dataframe_dummies_string_dtype(self, df, using_infer_string): }, dtype=bool, ) - if not using_infer_string: - # infer_string returns numpy bools + if any_string_dtype == "string" and any_string_dtype.na_value is pd.NA: expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index cbe2c9b931ee3..944e61896a182 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -21,7 +19,7 @@ def df(): res = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) res["id1"] = (res["A"] > 0).astype(np.int64) @@ -83,7 +81,6 @@ def test_default_col_names(self, df): result2 = df.melt(id_vars=["id1", "id2"]) assert result2.columns.tolist() == ["id1", "id2", "variable", "value"] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_value_vars(self, df): result3 = df.melt(id_vars=["id1", "id2"], value_vars="A") assert len(result3) == 10 @@ -100,7 +97,6 @@ def test_value_vars(self, df): ) tm.assert_frame_equal(result4, expected4) - @pytest.mark.xfail(using_string_dtype(), 
reason="TODO(infer_string)") @pytest.mark.parametrize("type_", (tuple, list, np.array)) def test_value_vars_types(self, type_, df): # GH 15348 @@ -181,7 +177,6 @@ def test_tuple_vars_fail_with_multiindex(self, id_vars, value_vars, df1): with pytest.raises(ValueError, match=msg): df1.melt(id_vars=id_vars, value_vars=value_vars) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_name(self, df, var_name): result5 = df.melt(var_name=var_name) assert result5.columns.tolist() == ["var", "value"] @@ -209,7 +204,6 @@ def test_custom_var_name(self, df, var_name): ) tm.assert_frame_equal(result9, expected9) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_value_name(self, df, value_name): result10 = df.melt(value_name=value_name) assert result10.columns.tolist() == ["variable", "val"] @@ -239,7 +233,6 @@ def test_custom_value_name(self, df, value_name): ) tm.assert_frame_equal(result14, expected14) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_and_value_name(self, df, value_name, var_name): result15 = df.melt(var_name=var_name, value_name=value_name) assert result15.columns.tolist() == ["var", "val"] @@ -364,7 +357,6 @@ def test_melt_missing_columns_raises(self): with pytest.raises(KeyError, match=msg): multi.melt(["A"], ["F"], col_level=0) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_melt_mixed_int_str_id_vars(self): # GH 29718 df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]}) @@ -372,6 +364,8 @@ def test_melt_mixed_int_str_id_vars(self): expected = DataFrame( {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]} ) + # the df's columns are mixed type and thus object -> preserves object dtype + expected["variable"] = expected["variable"].astype(object) tm.assert_frame_equal(result, expected) def test_melt_mixed_int_str_value_vars(self): @@ -1205,12 +1199,10 @@ def test_raise_of_column_name_value(self): ): df.melt(id_vars="value", value_name="value") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - @pytest.mark.parametrize("dtype", ["O", "string"]) - def test_missing_stubname(self, dtype): + def test_missing_stubname(self, any_string_dtype): # GH46044 df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]}) - df = df.astype({"id": dtype}) + df = df.astype({"id": any_string_dtype}) result = wide_to_long( df, stubnames=["a", "b"], @@ -1226,12 +1218,13 @@ def test_missing_stubname(self, dtype): {"a": [100, 200, 300, 400], "b": [np.nan] * 4}, index=index, ) - new_level = expected.index.levels[0].astype(dtype) + new_level = expected.index.levels[0].astype(any_string_dtype) + if any_string_dtype == "object": + new_level = expected.index.levels[0].astype("str") expected.index = expected.index.set_levels(new_level, level=0) tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wide_to_long_pyarrow_string_columns(): # GH 57066 pytest.importorskip("pyarrow") @@ -1250,7 +1243,7 @@ def test_wide_to_long_pyarrow_string_columns(): ) expected = DataFrame( [[1, 1], [1, 1], [1, 2]], - columns=Index(["D", "R"], dtype=object), + columns=Index(["D", "R"]), index=pd.MultiIndex.from_arrays( [ [1, 1, 1], diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 9aa13d59a586b..d0858a0ea5558 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py 
@@ -1081,7 +1081,6 @@ def test_margins_dtype_len(self, data): tm.assert_frame_equal(expected, result) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) def test_pivot_table_multiindex_only(self, cols): # GH 17038 @@ -1091,7 +1090,7 @@ def test_pivot_table_multiindex_only(self, cols): expected = DataFrame( [[4.0, 5.0, 6.0]], columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), - index=Index(["v"], dtype=object), + index=Index(["v"], dtype="str" if cols == ("a", "b") else "object"), ) tm.assert_frame_equal(result, expected) @@ -2525,13 +2524,16 @@ def test_pivot_empty(self): expected = DataFrame(index=[], columns=[]) tm.assert_frame_equal(result, expected, check_names=False) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - @pytest.mark.parametrize("dtype", [object, "string"]) - def test_pivot_integer_bug(self, dtype): - df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=dtype) + def test_pivot_integer_bug(self, any_string_dtype): + df = DataFrame( + data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=any_string_dtype + ) result = df.pivot(index=1, columns=0, values=2) - tm.assert_index_equal(result.columns, Index(["A", "B"], name=0, dtype=dtype)) + expected_columns = Index(["A", "B"], name=0, dtype=any_string_dtype) + if any_string_dtype == "object": + expected_columns = expected_columns.astype("str") + tm.assert_index_equal(result.columns, expected_columns) def test_pivot_index_none(self): # GH#3962 @@ -2613,7 +2615,9 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() # pylint: disable=missing-kwoa - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_columns_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2629,7 +2633,9 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_index_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2643,7 +2649,9 @@ def test_pivot_index_is_none(self): expected = DataFrame(3, index=[1], columns=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) From d1052cf286780f428bcc64bc16856bbf1b6ce09b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 22:21:36 +0200 Subject: [PATCH 270/396] BUG (string dtype): fix inplace mutation with copy=False in ensure_string_array (#59756) * BUG (string dtype): fix inplace mutation with copy=False in ensure_string_array * update --- pandas/_libs/lib.pyx | 19 ++++++++++++++----- pandas/tests/copy_view/test_astype.py | 22 +++++++++++++++++++++- pandas/tests/libs/test_lib.py | 14 ++++++++++++++ 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5d8a04664b0e4..d93099cd79d1b 100644 --- 
a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -736,7 +736,9 @@ cpdef ndarray[object] ensure_string_array( convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. copy : bool, default True - Whether to ensure that a new array is returned. + Whether to ensure that a new array is returned. When True, a new array + is always returned. When False, a new array is only returned when needed + to avoid mutating the input array. skipna : bool, default True Whether or not to coerce nulls to their stringified form (e.g. if False, NaN becomes 'nan'). @@ -765,10 +767,17 @@ cpdef ndarray[object] ensure_string_array( result = np.asarray(arr, dtype="object") - if copy and (result is arr or np.shares_memory(arr, result)): - # GH#54654 - result = result.copy() - elif not copy and result is arr: + if result is arr or np.may_share_memory(arr, result): + # if np.asarray(..) did not make a copy of the input arr, we still need + # to do that to avoid mutating the input array + # GH#54654: share_memory check is needed for rare cases where np.asarray + # returns a new object without making a copy of the actual data + if copy: + result = result.copy() + else: + already_copied = False + elif not copy and not result.flags.writeable: + # Weird edge case where result is a view already_copied = False if issubclass(arr.dtype.type, np.str_): diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index fb82329d5b50d..e0e3f6dc058a4 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -135,7 +135,8 @@ def test_astype_string_and_object_update_original( tm.assert_frame_equal(df2, df_orig) -def test_astype_string_copy_on_pickle_roundrip(): +def test_astype_str_copy_on_pickle_roundrip(): + # TODO(infer_string) this test can be removed after 3.0 (once str is the default) # https://github.com/pandas-dev/pandas/issues/54654 # ensure_string_array may alter array inplace base = Series(np.array([(1, 2), None, 1], dtype="object")) @@ -144,6 +145,25 @@ def test_astype_string_copy_on_pickle_roundrip(): tm.assert_series_equal(base, base_copy) +def test_astype_string_copy_on_pickle_roundrip(any_string_dtype): + # https://github.com/pandas-dev/pandas/issues/54654 + # ensure_string_array may alter array inplace + base = Series(np.array([(1, 2), None, 1], dtype="object")) + base_copy = pickle.loads(pickle.dumps(base)) + base_copy.astype(any_string_dtype) + tm.assert_series_equal(base, base_copy) + + +def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype): + # https://github.com/pandas-dev/pandas/issues/54654 + # ensure_string_array may alter read-only array inplace + base = Series(np.array([(1, 2), None, 1], dtype="object")) + base_copy = pickle.loads(pickle.dumps(base)) + base_copy._values.flags.writeable = False + base_copy.astype(any_string_dtype) + tm.assert_series_equal(base, base_copy) + + def test_astype_dict_dtypes(using_copy_on_write): df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")} diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 8583d8bcc052c..17dae1879f3b8 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -1,3 +1,5 @@ +import pickle + import numpy as np import pytest @@ -283,3 +285,15 @@ def test_no_default_pickle(): # GH#40397 obj = tm.round_trip_pickle(lib.no_default) assert obj is lib.no_default + + 
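The user-visible guarantee behind the ensure_string_array fix, sketched with the same pickle roundtrip the regression tests below use (pd.testing.assert_series_equal is the public counterpart of the tm helper):

    import pickle

    import numpy as np
    import pandas as pd

    base = pd.Series(np.array([(1, 2), None, 1], dtype=object))
    roundtripped = pickle.loads(pickle.dumps(base))
    roundtripped.astype("string")  # must copy internally, not mutate in place
    pd.testing.assert_series_equal(base, roundtripped)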
+def test_ensure_string_array_copy(): + # ensure the original array is not modified in case of copy=False with + # pickle-roundtripped object dtype array + # https://github.com/pandas-dev/pandas/issues/54654 + arr = np.array(["a", None], dtype=object) + arr = pickle.loads(pickle.dumps(arr)) + result = lib.ensure_string_array(arr, copy=False) + assert not np.shares_memory(arr, result) + assert arr[1] is None + assert result[1] is np.nan From 000ea360960be56b164abbc476e949f746898bdc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 22:34:28 +0200 Subject: [PATCH 271/396] TST (string dtype): remove usage of 'string[pyarrow_numpy]' alias (#59758) --- pandas/conftest.py | 28 +++++++++++++++++++ pandas/tests/apply/test_numba.py | 6 ++-- .../tests/arrays/string_/test_string_arrow.py | 5 ++-- pandas/tests/base/test_misc.py | 4 +-- pandas/tests/frame/indexing/test_indexing.py | 10 ++----- pandas/tests/frame/methods/test_rank.py | 14 +++++----- pandas/tests/frame/test_constructors.py | 7 ++--- pandas/tests/groupby/methods/test_size.py | 13 ++------- .../groupby/methods/test_value_counts.py | 14 ++-------- pandas/tests/groupby/test_groupby.py | 11 ++------ pandas/tests/groupby/test_reductions.py | 5 ++-- .../indexes/base_class/test_constructors.py | 4 +-- .../tests/indexes/base_class/test_reshape.py | 7 ++--- pandas/tests/indexes/object/test_indexing.py | 23 ++++----------- pandas/tests/indexes/test_base.py | 5 ++-- pandas/tests/indexes/test_old_base.py | 5 +++- pandas/tests/interchange/test_impl.py | 8 ++++-- pandas/tests/io/json/test_pandas.py | 8 +++--- .../io/parser/dtypes/test_dtypes_basic.py | 11 +++----- pandas/tests/io/pytables/test_read.py | 5 ++-- pandas/tests/io/test_feather.py | 4 ++- pandas/tests/io/test_orc.py | 4 +-- pandas/tests/io/test_parquet.py | 8 +++--- pandas/tests/io/test_sql.py | 3 +- pandas/tests/reshape/test_get_dummies.py | 22 +++++++-------- pandas/tests/reshape/test_melt.py | 8 +++--- pandas/tests/series/test_logical_ops.py | 3 +- pandas/tests/strings/test_find_replace.py | 2 +- pandas/tests/util/test_shares_memory.py | 6 ++-- 29 files changed, 119 insertions(+), 134 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 433ea7275223d..f957289ea52e8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1228,6 +1228,34 @@ def string_dtype(request): return request.param +@pytest.fixture( + params=[ + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], +) +def string_dtype_no_object(request): + """ + Parametrized fixture for string dtypes. 
+ * 'string[python]' (NA variant) + * 'string[pyarrow]' (NA variant) + * 'str' (NaN variant, with pyarrow) + * 'str' (NaN variant, without pyarrow) + """ + # need to instantiate the StringDtype here instead of in the params + # to avoid importing pyarrow during test collection + storage, na_value = request.param + return pd.StringDtype(storage, na_value) + + @pytest.fixture( params=[ "string[python]", diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 6bbe5100e8826..83b655f89e247 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -5,6 +5,7 @@ import pandas.util._test_decorators as td +import pandas as pd from pandas import ( DataFrame, Index, @@ -29,11 +30,10 @@ def test_numba_vs_python_noop(float_frame, apply_axis): def test_numba_vs_python_string_index(): # GH#56189 - pytest.importorskip("pyarrow") df = DataFrame( 1, - index=Index(["a", "b"], dtype="string[pyarrow_numpy]"), - columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + index=Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + columns=Index(["x", "y"], dtype=pd.StringDtype(na_value=np.nan)), ) func = lambda x: x result = df.apply(func, engine="numba", axis=0) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index e6957feecf4b5..2f3840e92b62a 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -241,10 +241,11 @@ def test_setitem_invalid_indexer_raises(): arr[[0, 1]] = ["foo", "bar", "baz"] -@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"]) -def test_pickle_roundtrip(dtype): +@pytest.mark.parametrize("na_value", [pd.NA, np.nan]) +def test_pickle_roundtrip(na_value): # GH 42600 pytest.importorskip("pyarrow") + dtype = StringDtype("pyarrow", na_value=na_value) expected = pd.Series(range(10), dtype=dtype) expected_sliced = expected.head(2) full_pickled = pickle.dumps(expected) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 3e0d8b1afedc0..b42e01c76335c 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -180,9 +180,7 @@ def test_access_by_position(index_flat): assert index[-1] == index[size - 1] msg = f"index {size} is out of bounds for axis 0 with size {size}" - if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal( - index.dtype, "string[pyarrow_numpy]" - ): + if isinstance(index.dtype, pd.StringDtype) and index.dtype.storage == "pyarrow": msg = "index out of bounds" with pytest.raises(IndexError, match=msg): index[size] diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 7a7586961deca..04dba325f060f 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1955,13 +1955,11 @@ def test_adding_new_conditional_column() -> None: ("dtype", "infer_string"), [ (object, False), - ("string[pyarrow_numpy]", True), + (pd.StringDtype(na_value=np.nan), True), ], ) def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: # https://github.com/pandas-dev/pandas/issues/56204 - pytest.importorskip("pyarrow") - df = DataFrame({"a": [1, 2], "b": [3, 4]}) with pd.option_context("future.infer_string", infer_string): df.loc[df["a"] == 1, "c"] = "1" @@ -1971,16 +1969,14 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: 
tm.assert_frame_equal(df, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_new_column_infer_string(): # GH#55366 - pytest.importorskip("pyarrow") df = DataFrame({"x": [1]}) with pd.option_context("future.infer_string", True): df.loc[df["x"] == 1, "y"] = "1" expected = DataFrame( - {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, - columns=Index(["x", "y"], dtype=object), + {"x": [1], "y": Series(["1"], dtype=pd.StringDtype(na_value=np.nan))}, + columns=Index(["x", "y"], dtype="str"), ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index edba971408d04..82722eeb1af72 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -14,6 +14,7 @@ ) from pandas.compat import HAS_PYARROW +import pandas as pd from pandas import ( DataFrame, Index, @@ -509,14 +510,13 @@ def test_rank_mixed_axis_zero(self, data, expected): result = df.rank(numeric_only=True) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "dtype, exp_dtype", - [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")], - ) - def test_rank_string_dtype(self, dtype, exp_dtype): + def test_rank_string_dtype(self, string_dtype_no_object): # GH#55362 - pytest.importorskip("pyarrow") - obj = Series(["foo", "foo", None, "foo"], dtype=dtype) + obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object) result = obj.rank(method="first") + exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64" + if string_dtype_no_object.storage == "python": + # TODO nullable string[python] should also return nullable Int64 + exp_dtype = "float64" expected = Series([1, 2, None, 3], dtype=exp_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 86d9dc0c7fbdc..f70d36d110625 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2721,8 +2721,7 @@ def test_construct_with_strings_and_none(self): def test_frame_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2756,8 +2755,7 @@ def test_frame_string_inference(self): def test_frame_string_inference_array_string_dtype(self): # GH#54496 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2781,7 +2779,6 @@ def test_frame_string_inference_array_string_dtype(self): def test_frame_string_inference_block_dim(self): # GH#55363 - pytest.importorskip("pyarrow") with pd.option_context("future.infer_string", True): df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) assert df._mgr.blocks[0].ndim == 2 diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index 5b4c08fc24411..fb834ee2a8799 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -3,8 +3,6 @@ from pandas._config import using_string_dtype -import pandas.util._test_decorators as td - from pandas.core.dtypes.common import is_integer_dtype from pandas import ( @@ -111,16 +109,9 @@ def test_size_series_masked_type_returns_Int64(dtype): 
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_size_strings(dtype): +def test_size_strings(any_string_dtype): # GH#55627 + dtype = any_string_dtype df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) result = df.groupby("a")["b"].size() exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index dc986d046ca41..d8c6c7c3fe50c 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -8,8 +8,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( Categorical, CategoricalIndex, @@ -389,14 +387,6 @@ def test_against_frame_and_seriesgroupby( tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( "sort, ascending, expected_rows, expected_count, expected_group_size", @@ -414,9 +404,10 @@ def test_compound( expected_rows, expected_count, expected_group_size, - dtype, + any_string_dtype, using_infer_string, ): + dtype = any_string_dtype education_df = education_df.astype(dtype) education_df.columns = education_df.columns.astype(dtype) # Multiple groupby keys and as_index=False @@ -433,6 +424,7 @@ def test_compound( expected["proportion"] = expected_count expected["proportion"] /= expected_group_size if dtype == "string[pyarrow]": + # TODO(nullable) also string[python] should return nullable dtypes expected["proportion"] = expected["proportion"].convert_dtypes() else: expected["count"] = expected_count diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 015a9db32883b..586ef8a126536 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2832,20 +2832,13 @@ def test_rolling_wrong_param_min_period(): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_by_column_values_with_same_starting_value(dtype): +def test_by_column_values_with_same_starting_value(any_string_dtype): # GH29635 df = DataFrame( { "Name": ["Thomas", "Thomas", "Thomas John"], "Credit": [1200, 1300, 900], - "Mood": Series(["sad", "happy", "happy"], dtype=dtype), + "Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype), } ) aggregate_details = {"Mood": Series.mode, "Credit": "sum"} diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index f67051de6e8c7..8e1bbcb43e3f3 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -702,10 +702,9 @@ def test_groupby_min_max_categorical(func): @pytest.mark.parametrize("func", ["min", "max"]) -def test_min_empty_string_dtype(func): +def test_min_empty_string_dtype(func, string_dtype_no_object): # GH#55619 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = string_dtype_no_object df = 
DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0] result = getattr(df.groupby("a"), func)() expected = DataFrame( diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 338509dd239e6..dcf0165ead6c0 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -47,9 +47,7 @@ def test_construct_empty_tuples(self, tuple_list): def test_index_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Index(["a", "b"], dtype=dtype) + expected = Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)) with pd.option_context("future.infer_string", True): ser = Index(["a", "b"]) tm.assert_index_equal(ser, expected) diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 6a544e448ebe1..b1a6c30b52f68 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -59,12 +59,11 @@ def test_insert_datetime_into_object(self, loc, val): tm.assert_index_equal(result, expected) assert type(expected[2]) is type(val) - def test_insert_none_into_string_numpy(self): + def test_insert_none_into_string_numpy(self, string_dtype_no_object): # GH#55365 - pytest.importorskip("pyarrow") - index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]") + index = Index(["a", "b", "c"], dtype=string_dtype_no_object) result = index.insert(-1, None) - expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]") + expected = Index(["a", "b", None, "c"], dtype=string_dtype_no_object) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 322e6677fe05d..57e5c5e3b6abb 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -7,7 +7,6 @@ NA, is_matching_na, ) -import pandas.util._test_decorators as td import pandas as pd from pandas import Index @@ -159,14 +158,6 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): class TestSliceLocs: - # TODO(infer_string) parametrize over multiple string dtypes - @pytest.mark.parametrize( - "dtype", - [ - "object", - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ], - ) @pytest.mark.parametrize( "in_slice,expected", [ @@ -190,24 +181,22 @@ class TestSliceLocs: (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected, dtype): - index = Index(list("bcdxy"), dtype=dtype) + def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected), dtype=dtype) + expected = Index(list(expected), dtype=any_string_dtype) tm.assert_index_equal(result, expected) - # TODO(infer_string) parametrize over multiple string dtypes - @td.skip_if_no("pyarrow") - def test_slice_locs_negative_step_oob(self): - index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") + def test_slice_locs_negative_step_oob(self, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) result = index[-10:5:1] tm.assert_index_equal(result, index) result = index[4:-10:-1] - 
expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]") + expected = Index(list("yxdcb"), dtype=any_string_dtype) tm.assert_index_equal(result, expected) def test_slice_locs_dup(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 813446440eded..3bcc62445f0ac 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -971,10 +971,9 @@ def test_isin_empty(self, empty): result = index.isin(empty) tm.assert_numpy_array_equal(expected, result) - @td.skip_if_no("pyarrow") - def test_isin_arrow_string_null(self): + def test_isin_string_null(self, string_dtype_no_object): # GH#55821 - index = Index(["a", "b"], dtype="string[pyarrow_numpy]") + index = Index(["a", "b"], dtype=string_dtype_no_object) result = index.isin([None]) expected = np.array([False, False]) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 37aa01ea046ca..176bf893cafa8 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -301,7 +301,10 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._ndarray, result._values._ndarray, check_same="same" ) - elif index.dtype in ("string[pyarrow]", "string[pyarrow_numpy]"): + elif ( + isinstance(index.dtype, StringDtype) + and index.dtype.storage == "pyarrow" + ): assert tm.shares_memory(result._values, index._values) else: raise NotImplementedError(index.dtype) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index d1a15dc93f702..b3af8def191ec 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -476,7 +476,7 @@ def test_non_str_names_w_duplicates(): ([1.0, 2.25, None], "Float32[pyarrow]", "float32"), ([True, False, None], "boolean", "bool"), ([True, False, None], "boolean[pyarrow]", "bool"), - (["much ado", "about", None], "string[pyarrow_numpy]", "large_string"), + (["much ado", "about", None], pd.StringDtype(na_value=np.nan), "large_string"), (["much ado", "about", None], "string[pyarrow]", "large_string"), ( [datetime(2020, 1, 1), datetime(2020, 1, 2), None], @@ -539,7 +539,11 @@ def test_pandas_nullable_with_missing_values( ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"), ([True, False, False], "boolean", "bool"), ([True, False, False], "boolean[pyarrow]", "bool"), - (["much ado", "about", "nothing"], "string[pyarrow_numpy]", "large_string"), + ( + ["much ado", "about", "nothing"], + pd.StringDtype(na_value=np.nan), + "large_string", + ), (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"), ( [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index de40441fe25dd..a8608434be5ee 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2139,18 +2139,18 @@ def test_pyarrow_engine_lines_false(): def test_json_roundtrip_string_inference(orient): - pytest.importorskip("pyarrow") df = DataFrame( [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"] ) out = df.to_json() with pd.option_context("future.infer_string", True): result = read_json(StringIO(out)) + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( [["a", "b"], ["c", "d"]], - dtype="string[pyarrow_numpy]", - index=Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"), - columns=Index(["col 1", "col 2"], 
dtype="string[pyarrow_numpy]"), + dtype=dtype, + index=Index(["row 1", "row 2"], dtype=dtype), + columns=Index(["col 1", "col 2"], dtype=dtype), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 800ece5a409e1..bc7b21baaeec5 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -547,8 +547,7 @@ def test_ea_int_avoid_overflow(all_parsers): def test_string_inference(all_parsers): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) data = """a,b x,1 @@ -568,8 +567,6 @@ def test_string_inference(all_parsers): @pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) def test_string_inference_object_dtype(all_parsers, dtype): # GH#56047 - pytest.importorskip("pyarrow") - data = """a,b x,a y,a @@ -583,7 +580,7 @@ def test_string_inference_object_dtype(all_parsers, dtype): "a": pd.Series(["x", "y", "z"], dtype=object), "b": pd.Series(["a", "a", "a"], dtype=object), }, - columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) @@ -593,9 +590,9 @@ def test_string_inference_object_dtype(all_parsers, dtype): expected = DataFrame( { "a": pd.Series(["x", "y", "z"], dtype=object), - "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"), + "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)), }, - columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index a04f02f0e052b..28cd8aea1defc 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -403,7 +403,6 @@ def test_read_py2_hdf_file_in_py3(datapath): def test_read_infer_string(tmp_path, setup_path): # GH#54431 - pytest.importorskip("pyarrow") df = DataFrame({"a": ["a", "b", None]}) path = tmp_path / setup_path df.to_hdf(path, key="data", format="table") @@ -411,7 +410,7 @@ def test_read_infer_string(tmp_path, setup_path): result = read_hdf(path, key="data", mode="r") expected = DataFrame( {"a": ["a", "b", None]}, - dtype="string[pyarrow_numpy]", - columns=Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 57e12747a3746..24fc801de44a7 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -249,5 +249,7 @@ def test_string_inference(self, tmp_path): df.to_feather(path) with pd.option_context("future.infer_string", True): result = read_feather(path) - expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]") + expected = pd.DataFrame( + data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan) + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index d2204a9134f90..4c4d7461e4ac5 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -438,7 +438,7 @@ def test_string_inference(tmp_path): result = read_orc(path) expected = pd.DataFrame( 
data={"a": ["x", "y"]}, - dtype="string[pyarrow_numpy]", - columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 578c0949a6c97..746ca3cf6534d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1128,8 +1128,8 @@ def test_string_inference(self, tmp_path, pa): result = read_parquet(path, engine="pyarrow") expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype="string[pyarrow_numpy]", - index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) @@ -1159,8 +1159,8 @@ def test_infer_string_large_string_type(self, tmp_path, pa): result = read_parquet(path) expected = pd.DataFrame( data={"a": [None, "b", "c"]}, - dtype="string[pyarrow_numpy]", - columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index b1557d71f15e4..514eaceaccbe6 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -3840,7 +3840,6 @@ class Test(BaseModel): def test_read_sql_string_inference(sqlite_engine): conn = sqlite_engine # GH#54430 - pytest.importorskip("pyarrow") table = "test" df = DataFrame({"a": ["x", "y"]}) df.to_sql(table, con=conn, index=False, if_exists="replace") @@ -3848,7 +3847,7 @@ def test_read_sql_string_inference(sqlite_engine): with pd.option_context("future.infer_string", True): result = read_sql_table(table, conn) - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 324d2a6cfd419..637bce59e9e2c 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -707,19 +707,17 @@ def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype): ) tm.assert_frame_equal(result, expected) - @td.skip_if_no("pyarrow") - def test_get_dummies_ea_dtype(self): + @pytest.mark.parametrize("dtype_type", ["string", "category"]) + def test_get_dummies_ea_dtype(self, dtype_type, string_dtype_no_object): # GH#56273 - for dtype, exp_dtype in [ - ("string[pyarrow]", "boolean"), - ("string[pyarrow_numpy]", "bool"), - (CategoricalDtype(Index(["a"], dtype="string[pyarrow]")), "boolean"), - (CategoricalDtype(Index(["a"], dtype="string[pyarrow_numpy]")), "bool"), - ]: - df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) - result = get_dummies(df) - expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) - tm.assert_frame_equal(result, expected) + dtype = string_dtype_no_object + exp_dtype = "boolean" if dtype.na_value is pd.NA else "bool" + if dtype_type == "category": + dtype = CategoricalDtype(Index(["a"], dtype)) + df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) + result = get_dummies(df) + expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) + tm.assert_frame_equal(result, expected) @td.skip_if_no("pyarrow") def 
test_get_dummies_arrow_dtype(self): diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 944e61896a182..e58187ba6bcbc 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -1225,9 +1225,9 @@ def test_missing_stubname(self, any_string_dtype): tm.assert_frame_equal(result, expected) -def test_wide_to_long_pyarrow_string_columns(): +def test_wide_to_long_string_columns(string_storage): # GH 57066 - pytest.importorskip("pyarrow") + string_dtype = pd.StringDtype(string_storage, na_value=np.nan) df = DataFrame( { "ID": {0: 1}, @@ -1237,7 +1237,7 @@ def test_wide_to_long_pyarrow_string_columns(): "D": {0: 1}, } ) - df.columns = df.columns.astype("string[pyarrow_numpy]") + df.columns = df.columns.astype(string_dtype) result = wide_to_long( df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*" ) @@ -1247,7 +1247,7 @@ def test_wide_to_long_pyarrow_string_columns(): index=pd.MultiIndex.from_arrays( [ [1, 1, 1], - Index(["test1", "test2", "test3"], dtype="string[pyarrow_numpy]"), + Index(["test1", "test2", "test3"], dtype=string_dtype), ], names=["ID", "UNPIVOTED"], ), diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index b9ddfc189edce..26bdfcbc6ec56 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -11,6 +11,7 @@ DataFrame, Index, Series, + StringDtype, bdate_range, ) import pandas._testing as tm @@ -533,7 +534,7 @@ def test_pyarrow_numpy_string_invalid(self): # GH#56008 pa = pytest.importorskip("pyarrow") ser = Series([False, True]) - ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]") + ser2 = Series(["a", "b"], dtype=StringDtype(na_value=np.nan)) result = ser == ser2 expected_eq = Series(False, index=ser.index) tm.assert_series_equal(result, expected_eq) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 8c5a9b39157ea..f52872c3d2835 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -26,7 +26,7 @@ def using_pyarrow(dtype): - return dtype in ("string[pyarrow]", "string[pyarrow_numpy]") + return dtype == "string" and dtype.storage == "pyarrow" def test_contains(any_string_dtype): diff --git a/pandas/tests/util/test_shares_memory.py b/pandas/tests/util/test_shares_memory.py index 00a897d574a07..8f1ac93b40247 100644 --- a/pandas/tests/util/test_shares_memory.py +++ b/pandas/tests/util/test_shares_memory.py @@ -1,3 +1,5 @@ +import numpy as np + import pandas.util._test_decorators as td import pandas as pd @@ -20,10 +22,10 @@ def test_shares_memory_string(): # GH#55823 import pyarrow as pa - obj = pd.array(["a", "b"], dtype="string[pyarrow]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=pd.NA)) assert tm.shares_memory(obj, obj) - obj = pd.array(["a", "b"], dtype="string[pyarrow_numpy]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan)) assert tm.shares_memory(obj, obj) obj = pd.array(["a", "b"], dtype=pd.ArrowDtype(pa.string())) From ed2db3a962d3b9fbd45e2ca1860636df6d6ef25a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 10 Sep 2024 01:18:29 -0700 Subject: [PATCH 272/396] BUG (string): Series.str.slice with negative step (#59724) Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.3.0.rst | 3 ++- pandas/core/arrays/_arrow_string_mixins.py | 31 ++++++++++++++++------ pandas/core/arrays/arrow/array.py | 11 -------- 
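As an illustration of the behavior this patch fixes (a hedged sketch, not part of the diff; assumes a pyarrow-backed string dtype is available): str.slice with a negative step must match plain Python slicing, which is why the new mixin implementation below maps an omitted start to -1 before calling pc.utf8_slice_codeunits.

# Reference semantics the fix targets (plain Python, no pandas required):
assert "abcd"[None:None:-1] == "dcba"  # omitted start + negative step reverses
assert "foobar"[None:2:-1] == "rab"    # walks backwards, stop is exclusive

# Sketch of the fixed accessor (requires pyarrow):
import pandas as pd
ser = pd.Series(["abcd", None], dtype="string[pyarrow]")
ser.str.slice(step=-1)  # expected: ["dcba", <NA>]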
pandas/core/arrays/string_arrow.py | 14 +--------- pandas/tests/extension/test_arrow.py | 1 + pandas/tests/strings/test_strings.py | 1 + 6 files changed, 28 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 03355f655eb28..03b3a6b55dff6 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -103,8 +103,9 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) +- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) - +- Interval ^^^^^^^^ diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 4829b175783ed..042747ae7da1c 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -11,6 +11,7 @@ from pandas.compat import ( pa_version_under10p1, + pa_version_under11p0, pa_version_under13p0, pa_version_under17p0, ) @@ -22,16 +23,13 @@ import pyarrow.compute as pc if TYPE_CHECKING: - from collections.abc import ( - Callable, - Sized, - ) + from collections.abc import Callable from pandas._typing import Scalar class ArrowStringArrayMixin: - _pa_array: Sized + _pa_array: pa.ChunkedArray def __init__(self, *args, **kwargs) -> None: raise NotImplementedError @@ -93,12 +91,29 @@ def _str_get(self, i: int): selected = pc.utf8_slice_codeunits( self._pa_array, start=start, stop=stop, step=step ) - null_value = pa.scalar( - None, type=self._pa_array.type # type: ignore[attr-defined] - ) + null_value = pa.scalar(None, type=self._pa_array.type) result = pc.if_else(not_out_of_bounds, selected, null_value) return type(self)(result) + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if pa_version_under11p0: + # GH#59724 + result = self._apply_elementwise(lambda val: val[start:stop:step]) + return type(self)(pa.chunked_array(result, type=self._pa_array.type)) + if start is None: + if step is not None and step < 0: + # GH#59710 + start = -1 + else: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + def _str_slice_replace( self, start: int | None = None, stop: int | None = None, repl: str | None = None ): diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 861ec0c42c885..764213de87593 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2368,17 +2368,6 @@ def _str_rpartition(self, sep: str, expand: bool): result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ): - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - def _str_len(self): return type(self)(pc.utf8_length(self._pa_array)) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 5ed12e7352bd1..a7a661e8c0cb8 100644 --- 
a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -295,6 +295,7 @@ def _data(self): _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith _str_pad = ArrowStringArrayMixin._str_pad + _str_slice = ArrowStringArrayMixin._str_slice def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -351,19 +352,6 @@ def _str_fullmatch( pat = f"{pat}$" return self._str_match(pat, case, flags, na) - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ): - if stop is None: - return super()._str_slice(start, stop, step) - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - def _str_len(self): result = pc.utf8_length(self._pa_array) return self._convert_int_result(result) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 12f3eedb6b9f1..d0ec87905aa87 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2018,6 +2018,7 @@ def test_str_join_string_type(): [None, 2, None, ["ab", None]], [None, 2, 1, ["ab", None]], [1, 3, 1, ["bc", None]], + (None, None, -1, ["dcba", None]), ], ) def test_str_slice(start, stop, step, exp): diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 015df18221b40..40b6c69dc8025 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -393,6 +393,7 @@ def test_pipe_failures(any_string_dtype): (2, 5, None, ["foo", "bar", np.nan, "baz"]), (0, 3, -1, ["", "", np.nan, ""]), (None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]), + (None, 2, -1, ["owtoo", "owtra", np.nan, "xuqza"]), (3, 10, 2, ["oto", "ato", np.nan, "aqx"]), (3, 0, -1, ["ofa", "aba", np.nan, "aba"]), ], From d3aa02dd94ffa4164c434576c414f0c58c33e41e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 10 Sep 2024 16:35:18 +0200 Subject: [PATCH 273/396] String dtype: remove fallback Performance warnings for string methods (#59760) --- pandas/core/arrays/arrow/_arrow_utils.py | 16 ----- pandas/core/arrays/string_arrow.py | 4 -- pandas/tests/extension/test_string.py | 1 - pandas/tests/indexes/test_setops.py | 12 ---- pandas/tests/strings/test_find_replace.py | 72 ++++++++--------------- pandas/tests/strings/test_string_array.py | 1 - 6 files changed, 23 insertions(+), 83 deletions(-) diff --git a/pandas/core/arrays/arrow/_arrow_utils.py b/pandas/core/arrays/arrow/_arrow_utils.py index 2a053fac2985c..285c3fd465ffc 100644 --- a/pandas/core/arrays/arrow/_arrow_utils.py +++ b/pandas/core/arrays/arrow/_arrow_utils.py @@ -1,24 +1,8 @@ from __future__ import annotations -import warnings - import numpy as np import pyarrow -from pandas.errors import PerformanceWarning -from pandas.util._exceptions import find_stack_level - - -def fallback_performancewarning(version: str | None = None) -> None: - """ - Raise a PerformanceWarning for falling back to ExtensionArray's - non-pyarrow method - """ - msg = "Falling back on a non-pyarrow code path which may decrease performance." - if version is not None: - msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning."
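To make the user-visible effect of this deletion concrete, a hedged sketch (not part of the diff; assumes a pyarrow-backed string dtype): operations that pyarrow compute cannot handle still fall back to the elementwise Python path, but they no longer emit the PerformanceWarning built in the lines above.

import re
import pandas as pd

ser = pd.Series(["Foo", "bar"], dtype="string[pyarrow]")
# regex flags are unsupported by pyarrow compute, so pandas falls back to
# the Python path; before this patch the fallback warned, afterwards it
# is silent:
ser.str.contains("foo", flags=re.IGNORECASE)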
- warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level()) - def pyarrow_array_to_numpy_and_mask( arr, dtype: np.dtype diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index a7a661e8c0cb8..1591253b01345 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -42,8 +42,6 @@ import pyarrow as pa import pyarrow.compute as pc - from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning - if TYPE_CHECKING: from collections.abc import Sequence @@ -301,7 +299,6 @@ def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True ): if flags: - fallback_performancewarning() return super()._str_contains(pat, case, flags, na, regex) if not isna(na): @@ -327,7 +324,6 @@ def _str_replace( regex: bool = True, ): if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: - fallback_performancewarning() return super()._str_replace(pat, repl, n, case, flags, regex) return ArrowExtensionArray._str_replace(self, pat, repl, n, case, flags, regex) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 7f04858318013..354b4d5333c7d 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -212,7 +212,6 @@ def test_compare_scalar(self, data, comparison_op): ser = pd.Series(data) self._compare_other(ser, data, comparison_op, "abc") - @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning") def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 4a6982cf98670..72c3396f124b8 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -240,9 +240,6 @@ def test_intersection_base(self, index): with pytest.raises(TypeError, match=msg): first.intersection([1, 2, 3]) - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_union_base(self, index): index = index.unique() @@ -270,9 +267,6 @@ def test_union_base(self, index): first.union([1, 2, 3]) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) def test_difference_base(self, sort, index): first = index[2:] second = index[:4] @@ -299,9 +293,6 @@ def test_difference_base(self, sort, index): first.difference([1, 2, 3], sort) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) def test_symmetric_difference(self, index): if isinstance(index, CategoricalIndex): pytest.skip(f"Not relevant for {type(index).__name__}") @@ -523,9 +514,6 @@ def test_intersection_difference_match_empty(self, index, sort): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") -@pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" -) @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] ) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 
f52872c3d2835..2742c5b67e57e 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -4,10 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW -from pandas.errors import PerformanceWarning import pandas.util._test_decorators as td import pandas as pd @@ -25,10 +21,6 @@ # -------------------------------------------------------------------------------------- -def using_pyarrow(dtype): - return dtype == "string" and dtype.storage == "pyarrow" - - def test_contains(any_string_dtype): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ @@ -281,10 +273,13 @@ def test_contains_nan(any_string_dtype): # -------------------------------------------------------------------------------------- -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) -def test_startswith_endswith_validate_na(any_string_dtype): +def test_startswith_endswith_validate_na(request, any_string_dtype): + if ( + any_string_dtype == "string" + and any_string_dtype.na_value is np.nan + and any_string_dtype.storage == "python" + ): + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) # GH#59615 ser = Series( ["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"], @@ -462,8 +457,7 @@ def test_replace_mixed_object(): def test_replace_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) + result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @@ -483,8 +477,7 @@ def test_replace_callable(any_string_dtype): # test with callable repl = lambda m: m.group(0).swapcase() - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) + result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -502,10 +495,7 @@ def test_replace_callable_raises(any_string_dtype, repl): r"(?(3)required )positional arguments?" 
) with pytest.raises(TypeError, match=msg): - with tm.maybe_produces_warning( - PerformanceWarning, using_pyarrow(any_string_dtype) - ): - values.str.replace("a", repl, regex=True) + values.str.replace("a", repl, regex=True) def test_replace_callable_named_groups(any_string_dtype): @@ -513,8 +503,7 @@ def test_replace_callable_named_groups(any_string_dtype): ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group("middle").swapcase() - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, repl, regex=True) + result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -525,13 +514,11 @@ def test_replace_compiled_regex(any_string_dtype): # test with compiled regex pat = re.compile(r"BAD_*") - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, "", regex=True) + result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, "", n=1, regex=True) + result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -552,8 +539,7 @@ def test_replace_compiled_regex_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, ", ", regex=True) + result = ser.str.replace(pat, ", ", regex=True) tm.assert_series_equal(result, expected) @@ -580,8 +566,7 @@ def test_replace_compiled_regex_callable(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace(pat, repl, n=2, regex=True) + result = ser.str.replace(pat, repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -629,8 +614,7 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("A", "YYY", case=False) + result = ser.str.replace("A", "YYY", case=False) expected = Series( [ "YYY", @@ -648,8 +632,7 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) + result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( [ "A", @@ -672,13 +655,11 @@ def test_replace_not_case_sensitive_not_regex(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/41602 ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = 
ser.str.replace("a", "c", case=False, regex=False) + result = ser.str.replace("a", "c", case=False, regex=False) expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.replace("a.", "c.", case=False, regex=False) + result = ser.str.replace("a.", "c.", case=False, regex=False) expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -850,8 +831,7 @@ def test_fullmatch_case_kwarg(any_string_dtype): result = ser.str.fullmatch("ab", case=False) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): - result = ser.str.fullmatch("ab", flags=re.IGNORECASE) + result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) @@ -1036,17 +1016,13 @@ def test_flags_kwarg(any_string_dtype): pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" - use_pyarrow = using_pyarrow(any_string_dtype) - result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] - with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow): - result = data.str.match(pat, flags=re.IGNORECASE) + result = data.str.match(pat, flags=re.IGNORECASE) assert result.iloc[0] - with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow): - result = data.str.fullmatch(pat, flags=re.IGNORECASE) + result = data.str.fullmatch(pat, flags=re.IGNORECASE) assert result.iloc[0] result = data.str.findall(pat, flags=re.IGNORECASE) @@ -1056,8 +1032,6 @@ def test_flags_kwarg(any_string_dtype): assert result.iloc[0] == 1 msg = "has match groups" - with tm.assert_produces_warning( - UserWarning, match=msg, raise_on_extra_warnings=not use_pyarrow - ): + with tm.assert_produces_warning(UserWarning, match=msg): result = data.str.contains(pat, flags=re.IGNORECASE) assert result.iloc[0] diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index 0b3f368afea5e..517ddb164985c 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -12,7 +12,6 @@ ) -@pytest.mark.filterwarnings("ignore:Falling back") def test_string_array(nullable_string_dtype, any_string_method): method_name, args, kwargs = any_string_method From 2f1caf58326043578ecbbba153302bb77092913d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 11 Sep 2024 12:40:01 -0700 Subject: [PATCH 274/396] REF (string): de-duplicate ArrowStringArray methods (#59555) --- pandas/core/arrays/_arrow_string_mixins.py | 90 ++++++++++++++++- pandas/core/arrays/arrow/array.py | 86 +---------------- pandas/core/arrays/string_arrow.py | 106 ++++----------------- 3 files changed, 108 insertions(+), 174 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 042747ae7da1c..a39668faf779e 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -1,6 +1,7 @@ from __future__ import annotations from functools import partial +import re from typing import ( TYPE_CHECKING, Any, @@ -25,7 +26,10 @@ if TYPE_CHECKING: from collections.abc import Callable - from pandas._typing import Scalar + from pandas._typing import ( + Scalar, + Self, + ) class ArrowStringArrayMixin: @@ -45,6 +49,37 @@ def _convert_int_result(self, result): def 
_apply_elementwise(self, func: Callable) -> list[list[Any]]: raise NotImplementedError + def _str_len(self): + result = pc.utf8_length(self._pa_array) + return self._convert_int_result(result) + + def _str_lower(self) -> Self: + return type(self)(pc.utf8_lower(self._pa_array)) + + def _str_upper(self) -> Self: + return type(self)(pc.utf8_upper(self._pa_array)) + + def _str_strip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_trim_whitespace(self._pa_array) + else: + result = pc.utf8_trim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_lstrip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_ltrim_whitespace(self._pa_array) + else: + result = pc.utf8_ltrim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_rstrip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_rtrim_whitespace(self._pa_array) + else: + result = pc.utf8_rtrim(self._pa_array, characters=to_strip) + return type(self)(result) + def _str_pad( self, width: int, @@ -125,7 +160,34 @@ def _str_slice_replace( stop = np.iinfo(np.int64).max return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) - def _str_capitalize(self): + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ) -> Self: + if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: + raise NotImplementedError( + "replace is not supported with a re.Pattern, callable repl, " + "case=False, or flags!=0" + ) + + func = pc.replace_substring_regex if regex else pc.replace_substring + # https://github.com/apache/arrow/issues/39149 + # GH 56404, unexpected behavior with negative max_replacements with pyarrow. 
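A hedged illustration of the pyarrow quirk the comment above refers to (not part of the diff; assumes pyarrow is installed):

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array(["aaa"])
# pandas treats n=-1 as "replace all occurrences", which pyarrow expresses
# as max_replacements=None:
pc.replace_substring(arr, pattern="a", replacement="b", max_replacements=None)
# -> ["bbb"]; a negative max_replacements does not reliably mean "all" in
# pyarrow (apache/arrow#39149), hence the mapping on the next line.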
+ pa_max_replacements = None if n < 0 else n + result = func( + self._pa_array, + pattern=pat, + replacement=repl, + max_replacements=pa_max_replacements, + ) + return type(self)(result) + + def _str_capitalize(self) -> Self: return type(self)(pc.utf8_capitalize(self._pa_array)) def _str_title(self): @@ -134,6 +196,16 @@ def _str_title(self): def _str_swapcase(self): return type(self)(pc.utf8_swapcase(self._pa_array)) + def _str_removeprefix(self, prefix: str): + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) + predicate = lambda val: val.removeprefix(prefix) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + def _str_removesuffix(self, suffix: str): ends_with = pc.ends_with(self._pa_array, pattern=suffix) removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) @@ -225,6 +297,20 @@ def _str_contains( result = result.fill_null(na) return self._convert_bool_result(result) + def _str_match( + self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.startswith("^"): + pat = f"^{pat}" + return self._str_contains(pat, case, flags, na, regex=True) + + def _str_fullmatch( + self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.endswith("$") or pat.endswith("\\$"): + pat = f"{pat}$" + return self._str_match(pat, case, flags, na) + def _str_find(self, sub: str, start: int = 0, end: int | None = None): if ( pa_version_under13p0 diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 764213de87593..56f38cc4f5361 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1989,7 +1989,7 @@ def _rank( """ See Series.rank.__doc__. """ - return type(self)( + return self._convert_int_result( self._rank_calc( axis=axis, method=method, @@ -2296,36 +2296,6 @@ def _str_count(self, pat: str, flags: int = 0): raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _result_converter(self, result): - return type(self)(result) - - def _str_replace( - self, - pat: str | re.Pattern, - repl: str | Callable, - n: int = -1, - case: bool = True, - flags: int = 0, - regex: bool = True, - ): - if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: - raise NotImplementedError( - "replace is not supported with a re.Pattern, callable repl, " - "case=False, or flags!=0" - ) - - func = pc.replace_substring_regex if regex else pc.replace_substring - # https://github.com/apache/arrow/issues/39149 - # GH 56404, unexpected behavior with negative max_replacements with pyarrow. 
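The duplicate being removed here now lives only on ArrowStringArrayMixin; subclasses re-export it via class-attribute aliasing rather than redefining it. A minimal standalone sketch of that pattern (illustrative names, not pandas code):

class Mixin:
    def shout(self) -> str:
        return self.value.upper()

class Impl(Mixin):
    # re-export without redefining, as string_arrow.py does with
    # _str_pad = ArrowStringArrayMixin._str_pad and friends
    speak = Mixin.shout

    def __init__(self, value: str) -> None:
        self.value = value

assert Impl("abc").speak() == "ABC"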
- pa_max_replacements = None if n < 0 else n - result = func( - self._pa_array, - pattern=pat, - replacement=repl, - max_replacements=pa_max_replacements, - ) - return type(self)(result) - def _str_repeat(self, repeats: int | Sequence[int]): if not isinstance(repeats, int): raise NotImplementedError( @@ -2334,20 +2304,6 @@ def _str_repeat(self, repeats: int | Sequence[int]): else: return type(self)(pc.binary_repeat(self._pa_array, repeats)) - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.endswith("$") or pat.endswith("\\$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) - def _str_join(self, sep: str): if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( self._pa_array.type @@ -2368,46 +2324,6 @@ def _str_rpartition(self, sep: str, expand: bool): result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_len(self): - return type(self)(pc.utf8_length(self._pa_array)) - - def _str_lower(self): - return type(self)(pc.utf8_lower(self._pa_array)) - - def _str_upper(self): - return type(self)(pc.utf8_upper(self._pa_array)) - - def _str_strip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_trim_whitespace(self._pa_array) - else: - result = pc.utf8_trim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_lstrip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._pa_array) - else: - result = pc.utf8_ltrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_rstrip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._pa_array) - else: - result = pc.utf8_rtrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_removeprefix(self, prefix: str): - if not pa_version_under13p0: - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return type(self)(result) - predicate = lambda val: val.removeprefix(prefix) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - def _str_casefold(self): predicate = lambda val: val.casefold() result = self._apply_elementwise(predicate) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1591253b01345..80651dcdaebe1 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -48,9 +48,7 @@ from pandas._typing import ( ArrayLike, - AxisInt, Dtype, - Scalar, npt, ) @@ -293,6 +291,20 @@ def _data(self): _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith _str_pad = ArrowStringArrayMixin._str_pad + _str_match = ArrowStringArrayMixin._str_match + _str_fullmatch = ArrowStringArrayMixin._str_fullmatch + _str_lower = ArrowStringArrayMixin._str_lower + _str_upper = ArrowStringArrayMixin._str_upper + _str_strip = ArrowStringArrayMixin._str_strip + _str_lstrip = ArrowStringArrayMixin._str_lstrip + _str_rstrip = ArrowStringArrayMixin._str_rstrip + _str_removesuffix = ArrowStringArrayMixin._str_removesuffix + _str_get = ArrowStringArrayMixin._str_get + _str_capitalize = 
ArrowStringArrayMixin._str_capitalize + _str_title = ArrowStringArrayMixin._str_title + _str_swapcase = ArrowStringArrayMixin._str_swapcase + _str_slice_replace = ArrowStringArrayMixin._str_slice_replace + _str_len = ArrowStringArrayMixin._str_len _str_slice = ArrowStringArrayMixin._str_slice def _str_contains( @@ -326,73 +338,21 @@ def _str_replace( if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: return super()._str_replace(pat, repl, n, case, flags, regex) - return ArrowExtensionArray._str_replace(self, pat, repl, n, case, flags, regex) + return ArrowStringArrayMixin._str_replace( + self, pat, repl, n, case, flags, regex + ) def _str_repeat(self, repeats: int | Sequence[int]): if not isinstance(repeats, int): return super()._str_repeat(repeats) else: - return type(self)(pc.binary_repeat(self._pa_array, repeats)) - - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.endswith("$") or pat.endswith("\\$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) - - def _str_len(self): - result = pc.utf8_length(self._pa_array) - return self._convert_int_result(result) - - def _str_lower(self): - return type(self)(pc.utf8_lower(self._pa_array)) - - def _str_upper(self): - return type(self)(pc.utf8_upper(self._pa_array)) - - def _str_strip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_trim_whitespace(self._pa_array) - else: - result = pc.utf8_trim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_lstrip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._pa_array) - else: - result = pc.utf8_ltrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_rstrip(self, to_strip=None): - if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._pa_array) - else: - result = pc.utf8_rtrim(self._pa_array, characters=to_strip) - return type(self)(result) + return ArrowExtensionArray._str_repeat(self, repeats=repeats) def _str_removeprefix(self, prefix: str): if not pa_version_under13p0: - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return type(self)(result) + return ArrowStringArrayMixin._str_removeprefix(self, prefix) return super()._str_removeprefix(prefix) - def _str_removesuffix(self, suffix: str): - ends_with = pc.ends_with(self._pa_array, pattern=suffix) - removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) - result = pc.if_else(ends_with, removed, self._pa_array) - return type(self)(result) - def _str_count(self, pat: str, flags: int = 0): if flags: return super()._str_count(pat, flags) @@ -449,28 +409,6 @@ def _reduce( else: return result - def _rank( - self, - *, - axis: AxisInt = 0, - method: str = "average", - na_option: str = "keep", - ascending: bool = True, - pct: bool = False, - ): - """ - See Series.rank.__doc__. 
- """ - return self._convert_int_result( - self._rank_calc( - axis=axis, - method=method, - na_option=na_option, - ascending=ascending, - pct=pct, - ) - ) - def value_counts(self, dropna: bool = True) -> Series: result = super().value_counts(dropna=dropna) if self.dtype.na_value is np.nan: @@ -492,9 +430,3 @@ def _cmp_method(self, other, op): class ArrowStringArrayNumpySemantics(ArrowStringArray): _na_value = np.nan - _str_get = ArrowStringArrayMixin._str_get - _str_removesuffix = ArrowStringArrayMixin._str_removesuffix - _str_capitalize = ArrowStringArrayMixin._str_capitalize - _str_title = ArrowStringArrayMixin._str_title - _str_swapcase = ArrowStringArrayMixin._str_swapcase - _str_slice_replace = ArrowStringArrayMixin._str_slice_replace From ea1e11800d3eeea95bcd5d6dc7f41bc3fd8d5d7a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 12 Sep 2024 23:08:34 +0200 Subject: [PATCH 275/396] BUG/API (string dtype): return float dtype for series[str].rank() (#59768) * BUG/API (string dtype): return float dtype for series[str].rank() * update frame tests * add whatsnew * correct whatsnew note --- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/core/arrays/arrow/array.py | 5 +- pandas/core/arrays/string_arrow.py | 11 ++++ pandas/tests/frame/methods/test_rank.py | 23 ++------ pandas/tests/series/methods/test_rank.py | 72 ++++++++++++++++++------ 5 files changed, 76 insertions(+), 36 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 03b3a6b55dff6..01c2ed3821d7a 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -102,6 +102,7 @@ Conversion Strings ^^^^^^^ +- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) - Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 56f38cc4f5361..e0ccbd6fdc5fd 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1989,7 +1989,7 @@ def _rank( """ See Series.rank.__doc__. 
""" - return self._convert_int_result( + return self._convert_rank_result( self._rank_calc( axis=axis, method=method, @@ -2291,6 +2291,9 @@ def _convert_bool_result(self, result): def _convert_int_result(self, result): return type(self)(result) + def _convert_rank_result(self, result): + return type(self)(result) + def _str_count(self, pat: str, flags: int = 0): if flags: raise NotImplementedError(f"count not implemented with {flags=}") diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 80651dcdaebe1..56f7d3aecce20 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -30,6 +30,7 @@ from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.arrow import ArrowExtensionArray from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.floating import Float64Dtype from pandas.core.arrays.integer import Int64Dtype from pandas.core.arrays.numeric import NumericDtype from pandas.core.arrays.string_ import ( @@ -388,6 +389,16 @@ def _convert_int_result(self, result): return Int64Dtype().__from_arrow__(result) + def _convert_rank_result(self, result): + if self.dtype.na_value is np.nan: + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() + return result.astype("float64", copy=False) + + return Float64Dtype().__from_arrow__(result) + def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 82722eeb1af72..37bed2da05743 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -6,15 +6,11 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.algos import ( Infinity, NegInfinity, ) -from pandas.compat import HAS_PYARROW -import pandas as pd from pandas import ( DataFrame, Index, @@ -474,23 +470,10 @@ def test_rank_inf_nans_na_option( ("top", False, [2.0, 3.0, 1.0, 4.0]), ], ) - def test_rank_object_first( - self, - request, - frame_or_series, - na_option, - ascending, - expected, - using_infer_string, - ): + def test_rank_object_first(self, frame_or_series, na_option, ascending, expected): obj = frame_or_series(["foo", "foo", None, "foo"]) - if using_string_dtype() and not HAS_PYARROW and isinstance(obj, Series): - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) - result = obj.rank(method="first", na_option=na_option, ascending=ascending) expected = frame_or_series(expected) - if using_infer_string and isinstance(obj, Series): - expected = expected.astype("uint64") tm.assert_equal(result, expected) @pytest.mark.parametrize( @@ -514,7 +497,9 @@ def test_rank_string_dtype(self, string_dtype_no_object): # GH#55362 obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object) result = obj.rank(method="first") - exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64" + exp_dtype = ( + "Float64" if string_dtype_no_object == "string[pyarrow]" else "float64" + ) if string_dtype_no_object.storage == "python": # TODO nullable string[python] should also return nullable Int64 exp_dtype = "float64" diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 24cf97c05c0a8..f0fe1d989941e 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -33,7 +33,8 
@@ def ser(): ["max", np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6])], ["first", np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6])], ["dense", np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])], - ] + ], + ids=lambda x: x[0], ) def results(request): return request.param @@ -48,12 +49,29 @@ def results(request): "Int64", pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + "string[python]", + "str", ] ) def dtype(request): return request.param +def expected_dtype(dtype, method, pct=False): + exp_dtype = "float64" + # elif dtype in ["Int64", "Float64", "string[pyarrow]", "string[python]"]: + if dtype in ["string[pyarrow]"]: + exp_dtype = "Float64" + elif dtype in ["float64[pyarrow]", "int64[pyarrow]"]: + if method == "average" or pct: + exp_dtype = "double[pyarrow]" + else: + exp_dtype = "uint64[pyarrow]" + + return exp_dtype + + class TestSeriesRank: def test_rank(self, datetime_series): sp_stats = pytest.importorskip("scipy.stats") @@ -241,12 +259,14 @@ def test_rank_signature(self): with pytest.raises(ValueError, match=msg): s.rank("average") - @pytest.mark.parametrize("dtype", [None, object]) - def test_rank_tie_methods(self, ser, results, dtype): + def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): method, exp = results + if dtype == "int64" or (not using_infer_string and dtype == "str"): + pytest.skip("int64/str does not support NaN") + ser = ser if dtype is None else ser.astype(dtype) result = ser.rank(method=method) - tm.assert_series_equal(result, Series(exp)) + tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method))) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) @@ -346,25 +366,35 @@ def test_rank_methods_series(self, method, op, value): ], ) def test_rank_dense_method(self, dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="dense") - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "dense")) tm.assert_series_equal(result, expected) - def test_rank_descending(self, ser, results, dtype): + def test_rank_descending(self, ser, results, dtype, using_infer_string): method, _ = results - if "i" in dtype: + if dtype == "int64" or (not using_infer_string and dtype == "str"): s = ser.dropna() else: s = ser.astype(dtype) res = s.rank(ascending=False) - expected = (s.max() - s).rank() - tm.assert_series_equal(res, expected) + if dtype.startswith("str"): + expected = (s.astype("float64").max() - s.astype("float64")).rank() + else: + expected = (s.max() - s).rank() + tm.assert_series_equal(res, expected.astype(expected_dtype(dtype, "average"))) - expected = (s.max() - s).rank(method=method) + if dtype.startswith("str"): + expected = (s.astype("float64").max() - s.astype("float64")).rank( + method=method + ) + else: + expected = (s.max() - s).rank(method=method) res2 = s.rank(method=method, ascending=False) - tm.assert_series_equal(res2, expected) + tm.assert_series_equal(res2, expected.astype(expected_dtype(dtype, method))) def test_rank_int(self, ser, results): method, exp = results @@ -421,9 +451,11 @@ def test_rank_ea_small_values(self): ], ) def test_rank_dense_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) 
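For orientation, the dtypes the expected_dtype helper above encodes, shown as a hedged sketch (assumes pyarrow is available for the pyarrow-backed dtypes):

import pandas as pd

pd.Series(["a", "b"], dtype="string[pyarrow]").rank().dtype  # Float64
pd.Series(["a", "b"], dtype="string[python]").rank().dtype   # float64
pd.Series([1, 2], dtype="int64[pyarrow]").rank().dtype       # double[pyarrow]
pd.Series([1, 2], dtype="int64[pyarrow]").rank(method="first").dtype  # uint64[pyarrow]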
result = s.rank(method="dense", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "dense", pct=True)) tm.assert_series_equal(result, expected) @@ -442,9 +474,11 @@ def test_rank_dense_pct(dtype, ser, exp): ], ) def test_rank_min_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="min", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "min", pct=True)) tm.assert_series_equal(result, expected) @@ -463,9 +497,11 @@ def test_rank_min_pct(dtype, ser, exp): ], ) def test_rank_max_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="max", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "max", pct=True)) tm.assert_series_equal(result, expected) @@ -484,9 +520,11 @@ def test_rank_max_pct(dtype, ser, exp): ], ) def test_rank_average_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="average", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "average", pct=True)) tm.assert_series_equal(result, expected) @@ -505,9 +543,11 @@ def test_rank_average_pct(dtype, ser, exp): ], ) def test_rank_first_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="first", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "first", pct=True)) tm.assert_series_equal(result, expected) From ee600b11931414a9965a5a6b5700d898c0f22ac0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 12 Sep 2024 23:11:52 +0200 Subject: [PATCH 276/396] String dtype: fix isin() values handling for python storage (#59759) * String dtype: fix isin() values handling for python storage * address feedback --- pandas/conftest.py | 9 ++++- pandas/core/arrays/string_.py | 20 +++++++++++ pandas/tests/arrays/string_/test_string.py | 41 +++++++++++++++++++--- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index f957289ea52e8..c6237d0309630 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1294,7 +1294,13 @@ def string_storage(request): pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), ("python", np.nan), - ] + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], ) def string_dtype_arguments(request): """ @@ -1325,6 +1331,7 @@ def dtype_backend(request): # Alias so we can test with cartesian product of string_storage string_storage2 = string_storage +string_dtype_arguments2 = string_dtype_arguments @pytest.fixture(params=tm.BYTES_DTYPES) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 43c46a4308f9e..0b0fffcb928a3 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -46,6 +46,7 @@ nanops, ops, ) +from pandas.core.algorithms import isin from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import ( @@ -65,6 +66,7 @@ import pyarrow from pandas._typing 
import (
+    ArrayLike,
     AxisInt,
     Dtype,
     DtypeObj,
@@ -733,6 +735,24 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
             # base class implementation that uses __setitem__
             ExtensionArray._putmask(self, mask, value)

+    def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
+        if isinstance(values, BaseStringArray) or (
+            isinstance(values, ExtensionArray) and is_string_dtype(values.dtype)
+        ):
+            values = values.astype(self.dtype, copy=False)
+        else:
+            if not lib.is_string_array(np.asarray(values), skipna=True):
+                values = np.array(
+                    [val for val in values if isinstance(val, str) or isna(val)],
+                    dtype=object,
+                )
+                if not len(values):
+                    return np.zeros(self.shape, dtype=bool)
+
+            values = self._from_sequence(values, dtype=self.dtype)
+
+        return isin(np.asarray(self), np.asarray(values))
+
     def astype(self, dtype, copy: bool = True):
         dtype = pandas_dtype(dtype)

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index d3a0897f88f61..265b9fc40629b 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -29,6 +29,12 @@ def dtype(string_dtype_arguments):
     return pd.StringDtype(storage=storage, na_value=na_value)


+@pytest.fixture
+def dtype2(string_dtype_arguments2):
+    storage, na_value = string_dtype_arguments2
+    return pd.StringDtype(storage=storage, na_value=na_value)
+
+
 @pytest.fixture
 def cls(dtype):
     """Fixture giving array type from parametrized 'dtype'"""
@@ -689,11 +695,7 @@ def test_isin(dtype, fixed_now_ts):
     tm.assert_series_equal(result, expected)

     result = s.isin(["a", pd.NA])
-    if dtype.storage == "python" and dtype.na_value is np.nan:
-        # TODO(infer_string) we should make this consistent
-        expected = pd.Series([True, False, False])
-    else:
-        expected = pd.Series([True, False, True])
+    expected = pd.Series([True, False, True])
     tm.assert_series_equal(result, expected)

     result = s.isin([])
@@ -704,6 +706,35 @@ def test_isin(dtype, fixed_now_ts):
     expected = pd.Series([True, False, False])
     tm.assert_series_equal(result, expected)

+    result = s.isin([fixed_now_ts])
+    expected = pd.Series([False, False, False])
+    tm.assert_series_equal(result, expected)
+
+
+def test_isin_string_array(dtype, dtype2):
+    s = pd.Series(["a", "b", None], dtype=dtype)
+
+    result = s.isin(pd.array(["a", "c"], dtype=dtype2))
+    expected = pd.Series([True, False, False])
+    tm.assert_series_equal(result, expected)
+
+    result = s.isin(pd.array(["a", None], dtype=dtype2))
+    expected = pd.Series([True, False, True])
+    tm.assert_series_equal(result, expected)
+
+
+def test_isin_arrow_string_array(dtype):
+    pa = pytest.importorskip("pyarrow")
+    s = pd.Series(["a", "b", None], dtype=dtype)
+
+    result = s.isin(pd.array(["a", "c"], dtype=pd.ArrowDtype(pa.string())))
+    expected = pd.Series([True, False, False])
+    tm.assert_series_equal(result, expected)
+
+    result = s.isin(pd.array(["a", None], dtype=pd.ArrowDtype(pa.string())))
+    expected = pd.Series([True, False, True])
+    tm.assert_series_equal(result, expected)
+

 def test_setitem_scalar_with_mask_validation(dtype):
     # https://github.com/pandas-dev/pandas/issues/47628

From a790592df19acf341102c60f8221eeb5ae1dc051 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 16 Sep 2024 19:25:59 +0200
Subject: [PATCH 277/396] String dtype: allow string dtype in query/eval with default numexpr engine (#59810)

String dtype: allow string dtype in query/eval with default numexpr engine
---
 pandas/core/computation/eval.py | 12 +++++++++---
pandas/core/computation/expr.py | 6 +++++- pandas/tests/frame/test_query_eval.py | 21 ++++++--------------- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index f1fe528de06f8..7bb623cba3755 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -10,7 +10,10 @@ from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg -from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.common import ( + is_extension_array_dtype, + is_string_dtype, +) from pandas.core.computation.engines import ENGINES from pandas.core.computation.expr import ( @@ -336,10 +339,13 @@ def eval( parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) if engine == "numexpr" and ( - is_extension_array_dtype(parsed_expr.terms.return_type) + ( + is_extension_array_dtype(parsed_expr.terms.return_type) + and not is_string_dtype(parsed_expr.terms.return_type) + ) or getattr(parsed_expr.terms, "operand_types", None) is not None and any( - is_extension_array_dtype(elem) + (is_extension_array_dtype(elem) and not is_string_dtype(elem)) for elem in parsed_expr.terms.operand_types ) ): diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index d642c37cea129..34055d2177626 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -20,6 +20,8 @@ from pandas.errors import UndefinedVariableError +from pandas.core.dtypes.common import is_string_dtype + import pandas.core.common as com from pandas.core.computation.ops import ( ARITH_OPS_SYMS, @@ -520,10 +522,12 @@ def _maybe_evaluate_binop( elif self.engine != "pytables": if ( getattr(lhs, "return_type", None) == object + or is_string_dtype(getattr(lhs, "return_type", None)) or getattr(rhs, "return_type", None) == object + or is_string_dtype(getattr(rhs, "return_type", None)) ): # evaluate "==" and "!=" in python if either of our operands - # has an object return type + # has an object or string return type return self._maybe_eval(res, eval_in_python + maybe_eval_in_python) return res diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 7dde0683aa960..27848e4d18596 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( NumExprClobberingError, UndefinedVariableError, @@ -747,7 +745,6 @@ def test_inf(self, op, f, engine, parser): result = df.query(q, engine=engine, parser=parser) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_check_tz_aware_index_query(self, tz_aware_fixture): # https://github.com/pandas-dev/pandas/issues/29463 tz = tz_aware_fixture @@ -760,6 +757,7 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture): tm.assert_frame_equal(result, expected) expected = DataFrame(df_index) + expected.columns = expected.columns.astype(object) result = df.reset_index().query('"2018-01-03 00:00:00+00" < time') tm.assert_frame_equal(result, expected) @@ -1057,7 +1055,7 @@ def test_query_with_string_columns(self, parser, engine): with pytest.raises(NotImplementedError, match=msg): df.query("a in b and c < d", parser=parser, engine=engine) - def test_object_array_eq_ne(self, parser, engine, using_infer_string): + def 
test_object_array_eq_ne(self, parser, engine): df = DataFrame( { "a": list("aaaabbbbcccc"), @@ -1066,14 +1064,11 @@ def test_object_array_eq_ne(self, parser, engine, using_infer_string): "d": np.random.default_rng(2).integers(9, size=12), } ) - warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None - with tm.assert_produces_warning(warning): - res = df.query("a == b", parser=parser, engine=engine) + res = df.query("a == b", parser=parser, engine=engine) exp = df[df.a == df.b] tm.assert_frame_equal(res, exp) - with tm.assert_produces_warning(warning): - res = df.query("a != b", parser=parser, engine=engine) + res = df.query("a != b", parser=parser, engine=engine) exp = df[df.a != df.b] tm.assert_frame_equal(res, exp) @@ -1112,16 +1107,12 @@ def test_query_with_nested_special_character(self, parser, engine): [">=", operator.ge], ], ) - def test_query_lex_compare_strings( - self, parser, engine, op, func, using_infer_string - ): + def test_query_lex_compare_strings(self, parser, engine, op, func): a = Series(np.random.default_rng(2).choice(list("abcde"), 20)) b = Series(np.arange(a.size)) df = DataFrame({"X": a, "Y": b}) - warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None - with tm.assert_produces_warning(warning): - res = df.query(f'X {op} "d"', engine=engine, parser=parser) + res = df.query(f'X {op} "d"', engine=engine, parser=parser) expected = df[func(df.X, "d")] tm.assert_frame_equal(res, expected) From dc4399cfec86dfcc8200a16220c199bc2f65544a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 25 Sep 2024 19:57:49 +0200 Subject: [PATCH 278/396] String dtype: map builtin str alias to StringDtype (#59685) * String dtype: map builtin str alias to StringDtype * fix tests * fix datetimelike astype and more tests * remove xfails * try fix typing * fix copy_view tests * fix remaining tests with infer_string enabled * ignore typing issue for now * move to common.py * simplify Categorical._str_get_dummies * small cleanup * fix ensure_string_array to not modify extension arrays inplace * fix ensure_string_array once more + fix is_extension_array_dtype for str * still xfail TestArrowArray::test_astype_str when not using infer_string * ensure maybe_convert_objects copies object dtype input array when inferring StringDtype * update test_1d_object_array_does_not_copy test * update constructor copy test + do not copy in maybe_convert_objects? 
* skip str.get_dummies test for now * use pandas_dtype() instead of registry.find * fix corner cases for calling pandas_dtype * add TODO comment in ensure_string_array --- pandas/_libs/lib.pyx | 9 +++- pandas/_testing/__init__.py | 2 +- pandas/core/arrays/categorical.py | 4 +- pandas/core/arrays/datetimelike.py | 10 ++++- pandas/core/dtypes/common.py | 18 +++++++- pandas/core/indexes/base.py | 6 ++- pandas/core/indexes/interval.py | 3 +- pandas/tests/arrays/floating/test_astype.py | 6 +-- pandas/tests/arrays/integer/test_dtypes.py | 6 +-- pandas/tests/arrays/sparse/test_astype.py | 4 +- pandas/tests/arrays/sparse/test_dtype.py | 2 +- pandas/tests/dtypes/test_common.py | 12 ++++++ pandas/tests/extension/base/casting.py | 4 +- pandas/tests/extension/json/array.py | 3 +- pandas/tests/extension/test_arrow.py | 29 +++---------- pandas/tests/frame/methods/test_astype.py | 17 ++++---- .../tests/frame/methods/test_select_dtypes.py | 5 ++- pandas/tests/frame/test_constructors.py | 41 +++++++++++++++---- .../indexes/datetimes/methods/test_astype.py | 15 ++++--- pandas/tests/indexes/object/test_astype.py | 4 +- .../indexes/period/methods/test_astype.py | 9 +++- .../indexes/timedeltas/methods/test_astype.py | 9 +++- pandas/tests/interchange/test_impl.py | 1 + pandas/tests/io/excel/test_readers.py | 8 ++-- .../io/parser/dtypes/test_dtypes_basic.py | 17 ++++---- pandas/tests/io/parser/test_na_values.py | 2 - .../io/parser/test_python_parser_only.py | 6 +-- pandas/tests/series/methods/test_astype.py | 30 ++++++++------ pandas/tests/series/methods/test_map.py | 4 +- pandas/tests/series/test_constructors.py | 2 +- pandas/tests/test_algos.py | 7 +++- 31 files changed, 183 insertions(+), 112 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d93099cd79d1b..c23f907aecfab 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -755,7 +755,14 @@ cpdef ndarray[object] ensure_string_array( if hasattr(arr, "to_numpy"): - if hasattr(arr, "dtype") and arr.dtype.kind in "mM": + if ( + hasattr(arr, "dtype") + and arr.dtype.kind in "mM" + # TODO: we should add a custom ArrowExtensionArray.astype implementation + # that handles astype(str) specifically, avoiding ending up here and + # then we can remove the below check for `_pa_array` (for ArrowEA) + and not hasattr(arr, "_pa_array") + ): # dtype check to exclude DataFrame # GH#41409 TODO: not a great place for this out = arr.astype(str).astype(object) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 10c1c490551fb..3aa7c64831efe 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -112,7 +112,7 @@ COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] if using_string_dtype(): - STRING_DTYPES: list[Dtype] = [str, "U"] + STRING_DTYPES: list[Dtype] = ["U"] else: STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef] COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6ffc0df243130..97004474648b2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2691,7 +2691,9 @@ def _str_get_dummies(self, sep: str = "|"): # sep may not be in categories. Just bail on this. 
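        # Likely rationale (an inference, assuming the str-alias mapping added
        # in this patch): astype(str) can now return a StringDtype
        # ExtensionArray rather than an object ndarray, which
        # NumpyExtensionArray cannot wrap, hence the explicit
        # to_numpy(str, na_value="NaN") conversion below.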
from pandas.core.arrays import NumpyExtensionArray - return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep) + return NumpyExtensionArray(self.to_numpy(str, na_value="NaN"))._str_get_dummies( + sep + ) # ------------------------------------------------------------------------ # GroupBy Methods diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e85c0222bbec3..81e2f04f2ba2e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -472,10 +472,16 @@ def astype(self, dtype, copy: bool = True): return self._box_values(self.asi8.ravel()).reshape(self.shape) + elif is_string_dtype(dtype): + if isinstance(dtype, ExtensionDtype): + arr_object = self._format_native_types(na_rep=dtype.na_value) # type: ignore[arg-type] + cls = dtype.construct_array_type() + return cls._from_sequence(arr_object, dtype=dtype, copy=False) + else: + return self._format_native_types() + elif isinstance(dtype, ExtensionDtype): return super().astype(dtype, copy=copy) - elif is_string_dtype(dtype): - return self._format_native_types() elif dtype.kind in "iu": # we deliberately ignore int32 vs. int64 here. # See https://github.com/pandas-dev/pandas/issues/24381 for more. diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index df0251d141984..fe705daaad5fa 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -12,6 +12,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( Interval, Period, @@ -1325,7 +1327,15 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: elif isinstance(dtype, np.dtype): return False else: - return registry.find(dtype) is not None + try: + with warnings.catch_warnings(): + # pandas_dtype(..) can raise UserWarning for class input + warnings.simplefilter("ignore", UserWarning) + dtype = pandas_dtype(dtype) + except (TypeError, ValueError): + # np.dtype(..) 
can raise ValueError + return False + return isinstance(dtype, ExtensionDtype) def is_ea_or_datetimelike_dtype(dtype: DtypeObj | None) -> bool: @@ -1620,6 +1630,12 @@ def pandas_dtype(dtype) -> DtypeObj: elif isinstance(dtype, (np.dtype, ExtensionDtype)): return dtype + # builtin aliases + if dtype is str and using_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + return StringDtype(na_value=np.nan) + # registered extension types result = registry.find(dtype) if result is not None: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a28c98ecc5cee..8e8eb768130fd 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6415,7 +6415,11 @@ def _should_compare(self, other: Index) -> bool: return False dtype = _unpack_nested_dtype(other) - return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) + return ( + self._is_comparable_dtype(dtype) + or is_object_dtype(dtype) + or is_string_dtype(dtype) + ) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 4fcdb87974511..635924674d9f4 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -50,6 +50,7 @@ is_number, is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -699,7 +700,7 @@ def _get_indexer( # left/right get_indexer, compare elementwise, equality -> match indexer = self._get_indexer_unique_sides(target) - elif not is_object_dtype(target.dtype): + elif not (is_object_dtype(target.dtype) or is_string_dtype(target.dtype)): # homogeneous scalar index: use IntervalTree # we should always have self._should_partial_index(target) here target = self._maybe_convert_i8(target) diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index ccf644b34051d..752ebe194ffcf 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -68,11 +68,9 @@ def test_astype_str(using_infer_string): if using_infer_string: expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan)) - tm.assert_extension_array_equal(a.astype("str"), expected) - # TODO(infer_string) this should also be a string array like above - expected = np.array(["0.1", "0.2", ""], dtype="U32") - tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) else: expected = np.array(["0.1", "0.2", ""], dtype="U32") diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index 7be00e569b3fe..90879d8bd3063 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -283,11 +283,9 @@ def test_astype_str(using_infer_string): if using_infer_string: expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan)) - tm.assert_extension_array_equal(a.astype("str"), expected) - # TODO(infer_string) this should also be a string array like above - expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") - tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) else: expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") diff --git a/pandas/tests/arrays/sparse/test_astype.py 
b/pandas/tests/arrays/sparse/test_astype.py index 83a507e679d46..e6e4a11a0f5ab 100644 --- a/pandas/tests/arrays/sparse/test_astype.py +++ b/pandas/tests/arrays/sparse/test_astype.py @@ -81,8 +81,8 @@ def test_astype_all(self, any_real_numpy_dtype): ), ( SparseArray([0, 1, 10]), - str, - SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")), + np.str_, + SparseArray(["0", "1", "10"], dtype=SparseDtype(np.str_, "0")), ), (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])), ( diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index 234f4092421e5..149c28341ba3d 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -177,7 +177,7 @@ def test_construct_from_string_fill_value_raises(string): [ (SparseDtype(int, 0), float, SparseDtype(float, 0.0)), (SparseDtype(int, 1), float, SparseDtype(float, 1.0)), - (SparseDtype(int, 1), str, SparseDtype(object, "1")), + (SparseDtype(int, 1), np.str_, SparseDtype(object, "1")), (SparseDtype(float, 1.5), int, SparseDtype(int, 1)), ], ) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index ccd30caba5dee..f7442cf5d6d3c 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -810,11 +810,23 @@ def test_pandas_dtype_string_dtypes(string_storage): "pyarrow" if HAS_PYARROW else "python", na_value=np.nan ) + with pd.option_context("future.infer_string", True): + # with the default string_storage setting + result = pandas_dtype(str) + assert result == pd.StringDtype( + "pyarrow" if HAS_PYARROW else "python", na_value=np.nan + ) + with pd.option_context("future.infer_string", True): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") assert result == pd.StringDtype(string_storage, na_value=np.nan) + with pd.option_context("future.infer_string", True): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype(str) + assert result == pd.StringDtype(string_storage, na_value=np.nan) + with pd.option_context("future.infer_string", False): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 2bfe801c48a77..56879129c3a28 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -43,8 +43,8 @@ def test_tolist(self, data): assert result == expected def test_astype_str(self, data): - result = pd.Series(data[:5]).astype(str) - expected = pd.Series([str(x) for x in data[:5]], dtype=str) + result = pd.Series(data[:2]).astype(str) + expected = pd.Series([str(x) for x in data[:2]], dtype=str) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index e43b50322bb92..5cbd45a99ae5c 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -207,9 +207,8 @@ def astype(self, dtype, copy=True): return self.copy() return self elif isinstance(dtype, StringDtype): - value = self.astype(str) # numpy doesn't like nested dicts arr_cls = dtype.construct_array_type() - return arr_cls._from_sequence(value, dtype=dtype, copy=False) + return arr_cls._from_sequence(self, dtype=dtype, copy=False) elif not copy: return np.asarray([dict(x) for x in self], dtype=dtype) else: diff --git a/pandas/tests/extension/test_arrow.py 
b/pandas/tests/extension/test_arrow.py index d0ec87905aa87..60e7bd83432c5 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -41,7 +41,6 @@ pa_version_under13p0, pa_version_under14p0, ) -import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -286,7 +285,7 @@ def test_map(self, data_missing, na_action): expected = data_missing.to_numpy() tm.assert_numpy_array_equal(result, expected) - def test_astype_str(self, data, request): + def test_astype_str(self, data, request, using_infer_string): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_binary(pa_dtype): request.applymarker( @@ -294,9 +293,10 @@ def test_astype_str(self, data, request): reason=f"For {pa_dtype} .astype(str) decodes.", ) ) - elif ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ) or pa.types.is_duration(pa_dtype): + elif not using_infer_string and ( + (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) + or pa.types.is_duration(pa_dtype) + ): request.applymarker( pytest.mark.xfail( reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", @@ -304,25 +304,6 @@ def test_astype_str(self, data, request): ) super().test_astype_str(data) - @pytest.mark.parametrize( - "nullable_string_dtype", - [ - "string[python]", - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], - ) - def test_astype_string(self, data, nullable_string_dtype, request): - pa_dtype = data.dtype.pyarrow_dtype - if ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ) or pa.types.is_duration(pa_dtype): - request.applymarker( - pytest.mark.xfail( - reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", - ) - ) - super().test_astype_string(data, nullable_string_dtype) - def test_from_dtype(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_string(pa_dtype) or pa.types.is_decimal(pa_dtype): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 9c27e76de91b2..ca3764ac87e95 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -169,21 +169,21 @@ def test_astype_str(self): "d": list(map(str, d._values)), "e": list(map(str, e._values)), }, - dtype="object", + dtype="str", ) tm.assert_frame_equal(result, expected) - def test_astype_str_float(self): + def test_astype_str_float(self, using_infer_string): # see GH#11302 result = DataFrame([np.nan]).astype(str) - expected = DataFrame(["nan"], dtype="object") + expected = DataFrame([np.nan if using_infer_string else "nan"], dtype="str") tm.assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(str) val = "1.1234567890123457" - expected = DataFrame([val], dtype="object") + expected = DataFrame([val], dtype="str") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) @@ -285,7 +285,7 @@ def test_astype_duplicate_col_series_arg(self): result = df.astype(dtypes) expected = DataFrame( { - 0: Series(vals[:, 0].astype(str), dtype=object), + 0: Series(vals[:, 0].astype(str), dtype="str"), 1: vals[:, 1], 2: pd.array(vals[:, 2], dtype="Float64"), 3: vals[:, 3], @@ -666,9 +666,10 @@ def test_astype_dt64tz(self, timezone_frame): # dt64tz->dt64 deprecated timezone_frame.astype("datetime64[ns]") - def test_astype_dt64tz_to_str(self, timezone_frame): + def test_astype_dt64tz_to_str(self, timezone_frame, using_infer_string): # str formatting result = timezone_frame.astype(str) + na_value = 
np.nan if using_infer_string else "NaT" expected = DataFrame( [ [ @@ -676,7 +677,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): "2013-01-01 00:00:00-05:00", "2013-01-01 00:00:00+01:00", ], - ["2013-01-02", "NaT", "NaT"], + ["2013-01-02", na_value, na_value], [ "2013-01-03", "2013-01-03 00:00:00-05:00", @@ -684,7 +685,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): ], ], columns=timezone_frame.columns, - dtype="object", + dtype="str", ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 875dca321635f..0354e9df3d168 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -99,6 +99,9 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string): ei = df[["a"]] tm.assert_frame_equal(ri, ei) + ri = df.select_dtypes(include=[str]) + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_list_like(self): df = DataFrame( { @@ -358,7 +361,7 @@ def test_select_dtypes_datetime_with_tz(self): @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string): - if using_infer_string and dtype == "str": + if using_infer_string and (dtype == "str" or dtype is str): # this is tested below pytest.skip("Selecting string columns works with future strings") df = DataFrame( diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index f70d36d110625..fd770b368c9da 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -24,7 +24,6 @@ from pandas._config import using_string_dtype from pandas._libs import lib -from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -83,7 +82,7 @@ def test_constructor_from_ndarray_with_str_dtype(self): # with an array of strings each of which is e.g. 
"[0 1 2]" arr = np.arange(12).reshape(4, 3) df = DataFrame(arr, dtype=str) - expected = DataFrame(arr.astype(str), dtype=object) + expected = DataFrame(arr.astype(str), dtype="str") tm.assert_frame_equal(df, expected) def test_constructor_from_2d_datetimearray(self, using_array_manager): @@ -328,19 +327,39 @@ def test_constructor_dtype_nocast_view_2d_array( assert df2._mgr.arrays[0].flags.c_contiguous @td.skip_array_manager_invalid_test - @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="conversion copies") - def test_1d_object_array_does_not_copy(self): + def test_1d_object_array_does_not_copy(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") df = DataFrame(arr, copy=False) + if using_infer_string: + if df[0].dtype.storage == "pyarrow": + # object dtype strings are converted to arrow memory, + # no numpy arrays to compare + pass + else: + assert np.shares_memory(df[0].to_numpy(), arr) + else: + assert np.shares_memory(df.values, arr) + + df = DataFrame(arr, dtype=object, copy=False) assert np.shares_memory(df.values, arr) @td.skip_array_manager_invalid_test - @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") - def test_2d_object_array_does_not_copy(self): + def test_2d_object_array_does_not_copy(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") df = DataFrame(arr, copy=False) + if using_infer_string: + if df[0].dtype.storage == "pyarrow": + # object dtype strings are converted to arrow memory, + # no numpy arrays to compare + pass + else: + assert np.shares_memory(df[0].to_numpy(), arr) + else: + assert np.shares_memory(df.values, arr) + + df = DataFrame(arr, dtype=object, copy=False) assert np.shares_memory(df.values, arr) def test_constructor_dtype_list_data(self): @@ -1793,12 +1812,18 @@ def test_constructor_column_duplicates(self): tm.assert_frame_equal(idf, edf) - def test_constructor_empty_with_string_dtype(self): + def test_constructor_empty_with_string_dtype(self, using_infer_string): # GH 9428 expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object) + expected_str = DataFrame( + index=[0, 1], columns=[0, 1], dtype=pd.StringDtype(na_value=np.nan) + ) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str) - tm.assert_frame_equal(df, expected) + if using_infer_string: + tm.assert_frame_equal(df, expected_str) + else: + tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_) tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5") diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index c0bc6601769b1..a9bcae625e494 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -102,13 +102,16 @@ def test_astype_tznaive_to_tzaware(self): # dt64->dt64tz deprecated idx._data.astype("datetime64[ns, US/Eastern]") - def test_astype_str_nat(self): + def test_astype_str_nat(self, using_infer_string): # GH 13149, GH 13209 # verify that we are returning NaT as a string (and not unicode) idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan]) result = idx.astype(str) - expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) + if using_infer_string: + expected = Index(["2016-05-16", None, None, None], dtype="str") + else: + expected 
= Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) tm.assert_index_equal(result, expected) def test_astype_str(self): @@ -118,7 +121,7 @@ def test_astype_str(self): expected = Index( ["2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04"], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -133,7 +136,7 @@ def test_astype_str_tz_and_name(self): "2012-01-03 00:00:00-05:00", ], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -144,7 +147,7 @@ def test_astype_str_freq_and_name(self): expected = Index( ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -156,7 +159,7 @@ def test_astype_str_freq_and_tz(self): result = dti.astype(str) expected = Index( ["2012-03-06 00:00:00+00:00", "2012-03-06 01:00:00+00:00"], - dtype=object, + dtype="str", name="test_name", ) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py index 9c1ef302c5b51..ce05b5e9f2238 100644 --- a/pandas/tests/indexes/object/test_astype.py +++ b/pandas/tests/indexes/object/test_astype.py @@ -15,12 +15,12 @@ def test_astype_str_from_bytes(): # ensure_string_array which does f"{val}" idx = Index(["あ", b"a"], dtype="object") result = idx.astype(str) - expected = Index(["あ", "a"], dtype="object") + expected = Index(["あ", "a"], dtype="str") tm.assert_index_equal(result, expected) # while we're here, check that Series.astype behaves the same result = Series(idx).astype(str) - expected = Series(expected, dtype=object) + expected = Series(expected, dtype="str") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py index d545bfd2fae0f..af3c2667f51b4 100644 --- a/pandas/tests/indexes/period/methods/test_astype.py +++ b/pandas/tests/indexes/period/methods/test_astype.py @@ -22,7 +22,7 @@ def test_astype_raises(self, dtype): with pytest.raises(TypeError, match=msg): idx.astype(dtype) - def test_astype_conversion(self): + def test_astype_conversion(self, using_infer_string): # GH#13149, GH#13209 idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D", name="idx") @@ -41,7 +41,12 @@ def test_astype_conversion(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype=object) + if using_infer_string: + expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) idx = period_range("1990", "2009", freq="Y", name="idx") diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py index 311f2b5c9aa59..5166cadae499e 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_astype.py +++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -44,7 +44,7 @@ def test_astype_object_with_nat(self): tm.assert_index_equal(result, expected) assert idx.tolist() == expected_list - def test_astype(self): + def test_astype(self, using_infer_string): # GH 13149, GH 13209 idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan], name="idx") @@ -61,7 +61,12 @@ def test_astype(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in 
idx], name="idx", dtype=object) + if using_infer_string: + expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) rng = timedelta_range("1 days", periods=10) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index b3af8def191ec..ef94c4c7aff2c 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -412,6 +412,7 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: pd.api.interchange.from_dataframe(df) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_empty_string_column(): # https://github.com/pandas-dev/pandas/issues/56703 df = pd.DataFrame({"a": []}, dtype=str) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 8dc76d8f747cb..3c5e1e1cf5afb 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -550,7 +550,7 @@ def test_reader_dtype(self, read_ext): expected["a"] = expected["a"].astype("float64") expected["b"] = expected["b"].astype("float32") - expected["c"] = Series(["001", "002", "003", "004"], dtype=object) + expected["c"] = Series(["001", "002", "003", "004"], dtype="str") tm.assert_frame_equal(actual, expected) msg = "Unable to convert column d to type int64" @@ -577,9 +577,9 @@ def test_reader_dtype(self, read_ext): { "a": Series([1, 2, 3, 4], dtype="float64"), "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), - "c": Series(["001", "002", "003", "004"], dtype=object), - "d": Series(["1", "2", np.nan, "4"], dtype=object), - } + "c": Series(["001", "002", "003", "004"], dtype="str"), + "d": Series(["1", "2", np.nan, "4"], dtype="str"), + }, ), ), ], diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index bc7b21baaeec5..787941c5d0376 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -28,7 +28,7 @@ @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) @pytest.mark.usefixtures("pyarrow_xfail") -def test_dtype_all_columns(all_parsers, dtype, check_orig): +def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string): # see gh-3795, gh-6607 parser = all_parsers @@ -46,8 +46,10 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): if check_orig: expected = df.copy() result = result.astype(float) - else: + elif using_infer_string and dtype is str: expected = df.astype(str) + else: + expected = df.astype(str).astype(object) tm.assert_frame_equal(result, expected) @@ -300,7 +302,6 @@ def test_true_values_cast_to_bool(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): @@ -316,7 +317,6 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_mangle_dup_cols_single_dtype(all_parsers): # GH#42022 @@ -565,7 +565,7 @@ def test_string_inference(all_parsers): 
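# Sketch of the behaviour the updated test below exercises (assuming
# future.infer_string is enabled): read_csv(..., dtype=str) now yields
# StringDtype(na_value=np.nan) columns, while dtype=object / "O" /
# np.object_ keeps plain object-dtype columns.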
@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) -def test_string_inference_object_dtype(all_parsers, dtype): +def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string): # GH#56047 data = """a,b x,a @@ -575,10 +575,11 @@ def test_string_inference_object_dtype(all_parsers, dtype): with pd.option_context("future.infer_string", True): result = parser.read_csv(StringIO(data), dtype=dtype) + expected_dtype = pd.StringDtype(na_value=np.nan) if dtype is str else object expected = DataFrame( { - "a": pd.Series(["x", "y", "z"], dtype=object), - "b": pd.Series(["a", "a", "a"], dtype=object), + "a": pd.Series(["x", "y", "z"], dtype=expected_dtype), + "b": pd.Series(["a", "a", "a"], dtype=expected_dtype), }, columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) @@ -589,7 +590,7 @@ def test_string_inference_object_dtype(all_parsers, dtype): expected = DataFrame( { - "a": pd.Series(["x", "y", "z"], dtype=object), + "a": pd.Series(["x", "y", "z"], dtype=expected_dtype), "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)), }, columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 1a3b7b37bf66b..5f9823f7225f9 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -630,7 +630,6 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # mismatched shape @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): @@ -682,7 +681,6 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): # TODO: this test isn't about the na_values keyword, it is about the empty entries # being returned with NaN entries, whereas the pyarrow engine returns "nan" @xfail_pyarrow # mismatched shapes -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 9e7530906afa3..5f2ddf7de9c6d 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -17,8 +17,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( ParserError, ParserWarning, @@ -498,7 +496,6 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}] ) @@ -523,10 +520,11 @@ def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, d "c": [0, 4000, 131], } ) + if dtype["a"] == object: + expected["a"] = expected["a"].astype(object) tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype,expected", [ diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index ef0757ffe4aa8..b9ba03d1e9f41 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -76,7 +76,7 @@ def 
test_astype_dict_like(self, dtype_class): dt1 = dtype_class({"abc": str}) result = ser.astype(dt1) - expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object) + expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype="str") tm.assert_series_equal(result, expected) dt2 = dtype_class({"abc": "float64"}) @@ -172,10 +172,14 @@ def test_astype_empty_constructor_equality(self, dtype): ) def test_astype_str_map(self, dtype, series, using_infer_string): # see GH#4405 + using_string_dtype = using_infer_string and dtype is str result = series.astype(dtype) - expected = series.map(str) - if using_infer_string: - expected = expected.astype(object) + if using_string_dtype: + expected = series.map(lambda val: str(val) if val is not np.nan else np.nan) + else: + expected = series.map(str) + if using_infer_string: + expected = expected.astype(object) tm.assert_series_equal(result, expected) def test_astype_float_to_period(self): @@ -212,7 +216,7 @@ def test_astype_dt64_to_str(self): # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex dti = date_range("2012-01-01", periods=3) result = Series(dti).astype(str) - expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object) + expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype="str") tm.assert_series_equal(result, expected) def test_astype_dt64tz_to_str(self): @@ -225,7 +229,7 @@ def test_astype_dt64tz_to_str(self): "2012-01-02 00:00:00-05:00", "2012-01-03 00:00:00-05:00", ], - dtype=object, + dtype="str", ) tm.assert_series_equal(result, expected) @@ -285,13 +289,13 @@ def test_astype_str_cast_dt64(self): ts = Series([Timestamp("2010-01-04 00:00:00")]) res = ts.astype(str) - expected = Series(["2010-01-04"], dtype=object) + expected = Series(["2010-01-04"], dtype="str") tm.assert_series_equal(res, expected) ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) res = ts.astype(str) - expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object) + expected = Series(["2010-01-04 00:00:00-05:00"], dtype="str") tm.assert_series_equal(res, expected) def test_astype_str_cast_td64(self): @@ -300,7 +304,7 @@ def test_astype_str_cast_td64(self): td = Series([Timedelta(1, unit="d")]) ser = td.astype(str) - expected = Series(["1 days"], dtype=object) + expected = Series(["1 days"], dtype="str") tm.assert_series_equal(ser, expected) def test_dt64_series_astype_object(self): @@ -347,7 +351,7 @@ def test_astype_from_float_to_str(self, dtype): # https://github.com/pandas-dev/pandas/issues/36451 ser = Series([0.1], dtype=dtype) result = ser.astype(str) - expected = Series(["0.1"], dtype=object) + expected = Series(["0.1"], dtype="str") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -358,11 +362,13 @@ def test_astype_from_float_to_str(self, dtype): (NA, ""), ], ) - def test_astype_to_str_preserves_na(self, value, string_value): + def test_astype_to_str_preserves_na(self, value, string_value, using_infer_string): # https://github.com/pandas-dev/pandas/issues/36904 ser = Series(["a", "b", value], dtype=object) result = ser.astype(str) - expected = Series(["a", "b", string_value], dtype=object) + expected = Series( + ["a", "b", None if using_infer_string else string_value], dtype="str" + ) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"]) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index ac489b2579e05..e5281a18236da 
100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -553,13 +553,11 @@ def f(x): (list(range(3)), {0: 42}, [42] + [np.nan] * 3), ], ) -def test_map_missing_mixed(vals, mapping, exp, using_infer_string): +def test_map_missing_mixed(vals, mapping, exp): # GH20495 s = Series(vals + [np.nan]) result = s.map(mapping) exp = Series(exp) - if using_infer_string and mapping == {np.nan: "not NaN"}: - exp.iloc[-1] = np.nan tm.assert_series_equal(result, exp) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 6efe0bcb8b45d..60b2ec7b6912d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -230,7 +230,7 @@ def test_constructor_empty(self, input_class, using_infer_string): # GH 19853 : with empty string, index and dtype str empty = Series("", dtype=str, index=range(3)) if using_infer_string: - empty2 = Series("", index=range(3), dtype=object) + empty2 = Series("", index=range(3), dtype="str") else: empty2 = Series("", index=range(3)) tm.assert_series_equal(empty, empty2) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index f3a7ba2607f4a..a7c2ec5acb7c2 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1900,13 +1900,16 @@ def test_strobj_mode(self): tm.assert_series_equal(ser.mode(), exp) @pytest.mark.parametrize("dt", [str, object]) - def test_strobj_multi_char(self, dt): + def test_strobj_multi_char(self, dt, using_infer_string): exp = ["bar"] data = ["foo"] * 2 + ["bar"] * 3 ser = Series(data, dtype=dt) exp = Series(exp, dtype=dt) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + if using_infer_string and dt is str: + tm.assert_extension_array_equal(algos.mode(ser.values), exp.values) + else: + tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) tm.assert_series_equal(ser.mode(), exp) def test_datelike_mode(self): From 4a9c46b87bd26a96c27fb11fd3889a6698fce4dc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 25 Sep 2024 21:16:04 +0200 Subject: [PATCH 279/396] String dtype: allow string dtype for non-raw apply with numba engine (#59854) * String dtype: allow string dtype for non-raw apply with numba engine * remove xfails * clean-up --- pandas/core/_numba/extensions.py | 3 ++- pandas/core/apply.py | 5 ----- pandas/tests/apply/test_frame_apply.py | 1 - pandas/tests/apply/test_numba.py | 4 ---- 4 files changed, 2 insertions(+), 11 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index ee09c9380fb0f..b05f12295a729 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -49,7 +49,8 @@ @contextmanager def set_numba_data(index: Index): numba_data = index._data - if numba_data.dtype == object: + if numba_data.dtype in (object, "string"): + numba_data = np.asarray(numba_data) if not lib.is_string_array(numba_data): raise ValueError( "The numba engine only supports using string or numeric column names" diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 25a71ce5b5f4f..fafc9ee1b6928 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1174,12 +1174,7 @@ def apply_with_numba(self) -> dict[int, Any]: from pandas.core._numba.extensions import set_numba_data index = self.obj.index - if index.dtype == "string": - index = index.astype(object) - columns = self.obj.columns - if columns.dtype == "string": - columns = columns.astype(object) # Convert from numba dict to regular dict 
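        # Note: the astype(object) conversions previously applied here to
        # string-dtype index/columns are no longer needed; per the
        # extensions.py change above, set_numba_data now coerces object and
        # "string" dtype labels with np.asarray before validating them.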
# Our isinstance checks in the df constructor don't pass for numbas typed dict diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 5e0f991d5c406..6a328dfb39be5 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -65,7 +65,6 @@ def test_apply(float_frame, engine, request): assert result.index is float_frame.index -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("raw", [True, False]) def test_apply_args(float_frame, axis, raw, engine, request): diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 83b655f89e247..20c067a776f4d 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -20,7 +18,6 @@ def apply_axis(request): return request.param -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_numba_vs_python_noop(float_frame, apply_axis): func = lambda x: x result = float_frame.apply(func, engine="numba", axis=apply_axis) @@ -43,7 +40,6 @@ def test_numba_vs_python_string_index(): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_numba_vs_python_indexing(): frame = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, From 2ce0f670b2d85c206be97384941b0f09af1a7ba4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 10 Oct 2024 12:53:16 +0200 Subject: [PATCH 280/396] fixup rank test --- pandas/tests/series/methods/test_rank.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index f0fe1d989941e..1c3ebe5653ce3 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -261,7 +261,11 @@ def test_rank_signature(self): def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): method, exp = results - if dtype == "int64" or (not using_infer_string and dtype == "str"): + if ( + dtype == "int64" + or dtype == "Int64" + or (not using_infer_string and dtype == "str") + ): pytest.skip("int64/str does not support NaN") ser = ser if dtype is None else ser.astype(dtype) From c74b6d21c3e154693b6d332c983805a42d328d99 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 10 Oct 2024 13:24:21 +0200 Subject: [PATCH 281/396] update tests --- pandas/tests/arithmetic/test_object.py | 4 ++++ pandas/tests/groupby/methods/test_describe.py | 6 +++++- pandas/tests/groupby/test_numeric_only.py | 1 + pandas/tests/groupby/transform/test_transform.py | 4 ++-- pandas/tests/indexing/test_iloc.py | 4 +++- pandas/tests/indexing/test_loc.py | 4 ---- pandas/tests/io/test_feather.py | 3 --- pandas/tests/reshape/test_melt.py | 5 ++++- 8 files changed, 19 insertions(+), 12 deletions(-) diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index bc0f78d3aa01a..44e485d40ba53 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -183,6 +183,10 @@ def test_objarr_add_invalid(self, op, box_with_array): "unsupported operand type", "must be str", "has no kernel", + "operation 'add' not supported", + "operation 'radd' not supported", + "operation 'sub' not supported", + "operation 'rsub' 
not supported", ] ) with pytest.raises(Exception, match=msg): diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py index c80063e673b81..c0889ab415e74 100644 --- a/pandas/tests/groupby/methods/test_describe.py +++ b/pandas/tests/groupby/methods/test_describe.py @@ -71,7 +71,7 @@ def test_series_describe_as_index(as_index, keys): tm.assert_frame_equal(result, expected) -def test_frame_describe_multikey(tsframe): +def test_frame_describe_multikey(tsframe, using_infer_string): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() desc_groups = [] @@ -87,6 +87,10 @@ def test_frame_describe_multikey(tsframe): expected = pd.concat(desc_groups, axis=1) tm.assert_frame_equal(result, expected) + # remainder of the tests fails with string dtype but is testing deprecated behaviour + if using_infer_string: + return + msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index b1fa541d42086..3b7614347d181 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -273,6 +273,7 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_str # cumsum, diff, pct_change "unsupported operand type", "has no kernel", + "operation 'sub' not supported for dtype 'str' with dtype 'float64'", ) if using_infer_string: pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 5823656a610e5..395036dd400e5 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -513,7 +513,7 @@ def test_transform_nuisance_raises(df, using_infer_string): msg = "Could not convert" if using_infer_string: if df.columns.dtype.storage == "pyarrow": - msg = "with dtype str does not support operation 'mean'" + msg = "with dtype str does not support reduction 'mean'" else: msg = "Cannot perform reduction 'mean' with string dtype" with pytest.raises(TypeError, match=msg): @@ -621,7 +621,7 @@ def test_groupby_transform_with_int(using_infer_string): msg = "Could not convert" if using_infer_string: if HAS_PYARROW: - msg = "with dtype str does not support operation 'mean'" + msg = "with dtype str does not support reduction 'mean'" else: msg = "Cannot perform reduction 'mean' with string dtype" with np.errstate(all="ignore"): diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 45f63bdf1ee32..c2742f42e3a92 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -1221,7 +1221,9 @@ def test_iloc_setitem_multicolumn_to_datetime(self, using_infer_string): df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]}) if using_infer_string: - with pytest.raises(TypeError, match="Invalid value"): + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) else: df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index ad72be02f81b1..bdc6d9aff6f4e 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ 
-16,7 +16,6 @@ from pandas._config import using_string_dtype from pandas._libs import index as libindex -from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IndexingError import pandas.util._test_decorators as td @@ -1459,9 +1458,6 @@ def test_loc_setitem_listlike_with_timedelta64index(self, indexer, expected): tm.assert_frame_equal(expected, df) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_loc_setitem_categorical_values_partial_column_slice(self): # Assigning a Category to parts of a int/... column uses the values of # the Categorical diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 24fc801de44a7..3b4484e44e155 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -2,8 +2,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm @@ -148,7 +146,6 @@ def test_path_localpath(self): result = tm.round_trip_localpath(df.to_feather, read_feather) tm.assert_frame_equal(df, result) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_passthrough_keywords(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index e58187ba6bcbc..72fd72df60761 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -1199,7 +1199,10 @@ def test_raise_of_column_name_value(self): ): df.melt(id_vars="value", value_name="value") - def test_missing_stubname(self, any_string_dtype): + def test_missing_stubname(self, request, any_string_dtype, using_infer_string): + if using_infer_string and any_string_dtype == "object": + # triggers object dtype inference warning of dtype=object + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) # GH46044 df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]}) df = df.astype({"id": any_string_dtype}) From 60b1b7b0957b01c2e0a481d494a8414257f419e7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 10 Oct 2024 13:29:11 +0200 Subject: [PATCH 282/396] fix linting --- pandas/core/dtypes/dtypes.py | 2 +- pandas/tests/extension/test_string.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index c6ca24d19b906..e7efb8598ec61 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1791,7 +1791,7 @@ def _is_na_fill_value(self) -> bool: @property def _is_numeric(self) -> bool: - return not self.subtype == object + return self.subtype != object @property def _is_boolean(self) -> bool: diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 354b4d5333c7d..07c3b4224e76f 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -212,9 +212,6 @@ def test_compare_scalar(self, data, comparison_op): ser = pd.Series(data) self._compare_other(ser, data, comparison_op, "abc") - def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): - super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) - def test_combine_add(self, data_repeated, using_infer_string, request): dtype = next(data_repeated(1)).dtype if using_infer_string and ( From 99be25305766d7785923799b9f7183c680637302 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" 
<39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:17:36 -0700 Subject: [PATCH 283/396] Backport PR #59816 on branch 2.3.x (Bump pypa/cibuildwheel from 2.20.0 to 2.21.0) (#60015) Backport PR #59816: Bump pypa/cibuildwheel from 2.20.0 to 2.21.0 Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 41417622c3ef2..8d7706042718b 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -150,7 +150,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.20.0 + uses: pypa/cibuildwheel@v2.21.0 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From e3302bcecea12375c79f467043a751a509714872 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 11 Oct 2024 10:06:43 +0200 Subject: [PATCH 284/396] [backport 2.3.x] TST (string dtype): resolve all infer_string TODO/xfails in pandas/tests/arrays (#59686) (#60020) TST (string dtype): resolve all infer_string TODO/xfails in pandas/tests/arrays (#59686) (cherry picked from commit 4f328f08df90906198b3a3f955ab321018964f0a) --- pandas/core/arrays/string_arrow.py | 6 +++++- pandas/tests/arrays/categorical/test_analytics.py | 10 ++++++++-- pandas/tests/arrays/integer/test_reduction.py | 7 +------ 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 56f7d3aecce20..8dcf7643b579e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -408,9 +408,13 @@ def _reduce( arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, "")) else: arr = pc.not_equal(self._pa_array, "") - return ArrowExtensionArray(arr)._reduce( + result = ArrowExtensionArray(arr)._reduce( name, skipna=skipna, keepdims=keepdims, **kwargs ) + if keepdims: + # ArrowExtensionArray will return a length-1 bool[pyarrow] array + return result.astype(np.bool_) + return result result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) if name in ("argmin", "argmax") and isinstance(result, pa.Array): diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index c2c53fbc4637e..9a0356cbc422b 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -296,7 +296,7 @@ def test_nbytes(self): exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories assert cat.nbytes == exp - def test_memory_usage(self): + def test_memory_usage(self, using_infer_string): cat = Categorical([1, 2, 3]) # .categories is an index, so we include the hashtable @@ -304,7 +304,13 @@ def test_memory_usage(self): assert 0 < cat.nbytes <= cat.memory_usage(deep=True) cat = Categorical(["foo", "foo", "bar"]) - assert cat.memory_usage(deep=True) > cat.nbytes + if using_infer_string: + if cat.categories.dtype.storage == "python": + assert cat.memory_usage(deep=True) > cat.nbytes + else: + assert cat.memory_usage(deep=True) >= cat.nbytes + else: + assert cat.memory_usage(deep=True) > cat.nbytes if not PYPY: # sys.getsizeof will call the .memory_usage with diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py index 
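A short illustration of the relaxed memory_usage assertion above; whether deep usage strictly exceeds nbytes depends on where the category labels are stored (a sketch, not part of the patch):

    import pandas as pd

    cat = pd.Categorical(["foo", "foo", "bar"])

    shallow = cat.nbytes                 # int8 codes buffer + categories buffer
    deep = cat.memory_usage(deep=True)   # additionally counts per-object overhead

    # object-dtype categories: each Python string object adds overhead, so deep > shallow.
    # pyarrow-backed string categories: the bytes already live in the Arrow buffers,
    # so deep can equal shallow, hence the weaker `>=` assertion in the test.
    assert deep >= shallow
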
e485c7f79b475..1c91cd25ba69c 100644 --- a/pandas/tests/arrays/integer/test_reduction.py +++ b/pandas/tests/arrays/integer/test_reduction.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas.compat import HAS_PYARROW - import pandas as pd from pandas import ( DataFrame, @@ -104,10 +102,7 @@ def test_groupby_reductions(op, expected): ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], ], ) -def test_mixed_reductions(request, op, expected, using_infer_string): - if op in ["any", "all"] and using_infer_string and HAS_PYARROW: - # TODO(infer_string) inconsistent result type - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) +def test_mixed_reductions(op, expected): df = DataFrame( { "A": ["a", "b", "b"], From a24a6534218af441eab3d7263464d58d659c0229 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 11 Oct 2024 10:07:10 +0200 Subject: [PATCH 285/396] [backport 2.3.x] String dtype: propagate NaNs as False in predicate methods (eg .str.startswith) (#59616) (#60014) * String dtype: propagate NaNs as False in predicate methods (eg .str.startswith) (#59616) (cherry picked from commit 88554d0ca77c7b80605a34f9ece838b834db8720) * ignore object dtype inference warnings --- pandas/core/arrays/_arrow_string_mixins.py | 44 +++-- pandas/core/arrays/arrow/array.py | 6 +- pandas/core/arrays/categorical.py | 20 ++- pandas/core/arrays/string_.py | 33 ++-- pandas/core/arrays/string_arrow.py | 42 +++-- pandas/core/strings/accessor.py | 40 +++-- pandas/core/strings/base.py | 10 +- pandas/core/strings/object_array.py | 33 ++-- pandas/tests/strings/test_api.py | 9 +- pandas/tests/strings/test_find_replace.py | 193 ++++++++++++++------- pandas/tests/strings/test_string_array.py | 2 +- pandas/tests/strings/test_strings.py | 30 +++- 12 files changed, 316 insertions(+), 146 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index a39668faf779e..e136b4f92031d 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -10,6 +10,7 @@ import numpy as np +from pandas._libs import lib from pandas.compat import ( pa_version_under10p1, pa_version_under11p0, @@ -17,8 +18,6 @@ pa_version_under17p0, ) -from pandas.core.dtypes.missing import isna - if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc @@ -38,7 +37,7 @@ class ArrowStringArrayMixin: def __init__(self, *args, **kwargs) -> None: raise NotImplementedError - def _convert_bool_result(self, result): + def _convert_bool_result(self, result, na=lib.no_default, method_name=None): # Convert a bool-dtype result to the appropriate result type raise NotImplementedError @@ -212,7 +211,9 @@ def _str_removesuffix(self, suffix: str): result = pc.if_else(ends_with, removed, self._pa_array) return type(self)(result) - def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + def _str_startswith( + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default + ): if isinstance(pat, str): result = pc.starts_with(self._pa_array, pattern=pat) else: @@ -225,11 +226,11 @@ def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): for p in pat[1:]: result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): # pyright: ignore [reportGeneralTypeIssues] - result = result.fill_null(na) - return self._convert_bool_result(result) + return self._convert_bool_result(result, na=na, method_name="startswith") - 
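The na=lib.no_default signatures introduced above rely on a sentinel so the methods can tell "caller passed nothing" apart from an explicit na=None; a generic sketch of the pattern with hypothetical names, not pandas internals:

    _no_default = object()  # unique sentinel, never equal to any user-supplied value

    def str_startswith(values, pat, na=_no_default):
        if na is _no_default:
            # nothing was passed: apply the dtype's default; for the NaN-variant
            # "str" dtype, missing values propagate as False
            na = False
        return [v.startswith(pat) if isinstance(v, str) else na for v in values]

    str_startswith(["om", None, "foo_nom"], "foo")           # [False, False, True]
    str_startswith(["om", None, "foo_nom"], "foo", na=None)  # [False, None, True]
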
def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + def _str_endswith( + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default + ): if isinstance(pat, str): result = pc.ends_with(self._pa_array, pattern=pat) else: @@ -242,9 +243,7 @@ def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): for p in pat[1:]: result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): # pyright: ignore [reportGeneralTypeIssues] - result = result.fill_null(na) - return self._convert_bool_result(result) + return self._convert_bool_result(result, na=na, method_name="endswith") def _str_isalnum(self): result = pc.utf8_is_alnum(self._pa_array) @@ -283,7 +282,12 @@ def _str_isupper(self): return self._convert_bool_result(result) def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, + regex: bool = True, ): if flags: raise NotImplementedError(f"contains not implemented with {flags=}") @@ -293,19 +297,25 @@ def _str_contains( else: pa_contains = pc.match_substring result = pa_contains(self._pa_array, pat, ignore_case=not case) - if not isna(na): # pyright: ignore [reportGeneralTypeIssues] - result = result.fill_null(na) - return self._convert_bool_result(result) + return self._convert_bool_result(result, na=na, method_name="contains") def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, ): if not pat.startswith("^"): pat = f"^{pat}" return self._str_contains(pat, case, flags, na, regex=True) def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None + self, + pat, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, ): if not pat.endswith("$") or pat.endswith("\\$"): pat = f"{pat}$" diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e0ccbd6fdc5fd..f3d7a3cc6d694 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2285,7 +2285,11 @@ def _apply_elementwise(self, func: Callable) -> list[list[Any]]: for chunk in self._pa_array.iterchunks() ] - def _convert_bool_result(self, result): + def _convert_bool_result(self, result, na=lib.no_default, method_name=None): + if na is not lib.no_default and not isna( + na + ): # pyright: ignore [reportGeneralTypeIssues] + result = result.fill_null(na) return type(self)(result) def _convert_int_result(self, result): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 97004474648b2..366253a923f6c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2675,16 +2675,28 @@ def _replace(self, *, to_replace, value, inplace: bool = False): # ------------------------------------------------------------------------ # String methods interface def _str_map( - self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True + self, f, na_value=lib.no_default, dtype=np.dtype("object"), convert: bool = True ): # Optimization to apply the callable `f` to the categories once # and rebuild the result by `take`ing from the result with the codes. # Returns the same type as the object-dtype implementation though. 
- from pandas.core.arrays import NumpyExtensionArray - categories = self.categories codes = self.codes - result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype) + if categories.dtype == "string": + result = categories.array._str_map(f, na_value, dtype) # type: ignore[attr-defined] + if ( + categories.dtype.na_value is np.nan # type: ignore[union-attr] + and is_bool_dtype(dtype) + and (na_value is lib.no_default or isna(na_value)) + ): + # NaN propagates as False for functions with boolean return type + na_value = False + else: + from pandas.core.arrays import NumpyExtensionArray + + result = NumpyExtensionArray(categories.to_numpy())._str_map( + f, na_value, dtype + ) return take_nd(result, codes, fill_value=na_value) def _str_get_dummies(self, sep: str = "|"): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0b0fffcb928a3..5b69344bac0c8 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -377,7 +377,11 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: return cls._from_sequence(scalars, dtype=dtype) def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + self, + f, + na_value=lib.no_default, + dtype: Dtype | None = None, + convert: bool = True, ): if self.dtype.na_value is np.nan: return self._str_map_nan_semantics( @@ -388,7 +392,7 @@ def _str_map( if dtype is None: dtype = self.dtype - if na_value is None: + if na_value is lib.no_default: na_value = self.dtype.na_value mask = isna(self) @@ -458,12 +462,20 @@ def _str_map_str_or_object( return lib.map_infer_mask(arr, f, mask.view("uint8")) def _str_map_nan_semantics( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + self, + f, + na_value=lib.no_default, + dtype: Dtype | None = None, + convert: bool = True, ): if dtype is None: dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value + if na_value is lib.no_default: + if is_bool_dtype(dtype): + # NaN propagates as False + na_value = False + else: + na_value = self.dtype.na_value mask = isna(self) arr = np.asarray(self) @@ -474,7 +486,8 @@ def _str_map_nan_semantics( if is_integer_dtype(dtype): na_value = 0 else: - na_value = True + # NaN propagates as False + na_value = False result = lib.map_infer_mask( arr, @@ -484,15 +497,13 @@ def _str_map_nan_semantics( na_value=na_value, dtype=np.dtype(cast(type, dtype)), ) - if na_value_is_na and mask.any(): + if na_value_is_na and is_integer_dtype(dtype) and mask.any(): # TODO: we could alternatively do this check before map_infer_mask # and adjust the dtype/na_value we pass there. Which is more # performant? 
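The Categorical._str_map path above uses a standard optimization: evaluate the string function once per category and broadcast the results back through the codes. A self-contained NumPy sketch of the idea (illustrative only; the real code uses take_nd with a fill value):

    import numpy as np

    categories = np.array(["a", "b", "c"], dtype=object)
    codes = np.array([0, 0, 1, 2, 1, -1])  # -1 marks a missing value

    per_category = np.array([c.upper() for c in categories], dtype=object)
    broadcast = per_category.take(codes, mode="clip")   # -1 clips to a dummy slot
    result = np.where(codes >= 0, broadcast, None)
    # ['A', 'A', 'B', 'C', 'B', None]: one function call per category, not per row
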
- if is_integer_dtype(dtype): - result = result.astype("float64") - else: - result = result.astype("object") + result = result.astype("float64") result[mask] = np.nan + return result else: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8dcf7643b579e..9389f7cffca9f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -211,9 +211,29 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) - def _convert_bool_result(self, values): + def _convert_bool_result(self, values, na=lib.no_default, method_name=None): + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + f"Allowing a non-bool 'na' in obj.str.{method_name} is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + na = bool(na) + if self.dtype.na_value is np.nan: - return ArrowExtensionArray(values).to_numpy(na_value=np.nan) + if na is lib.no_default or isna(na): + # NaN propagates as False + values = values.fill_null(False) + else: + values = values.fill_null(na) + return values.to_numpy() + else: + if na is not lib.no_default and not isna( + na + ): # pyright: ignore [reportGeneralTypeIssues] + values = values.fill_null(na) return BooleanDtype().__from_arrow__(values) def _maybe_convert_setitem_value(self, value): @@ -309,22 +329,16 @@ def _data(self): _str_slice = ArrowStringArrayMixin._str_slice def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na=lib.no_default, + regex: bool = True, ): if flags: return super()._str_contains(pat, case, flags, na, regex) - if not isna(na): - if not isinstance(na, bool): - # GH#59561 - warnings.warn( - "Allowing a non-bool 'na' in obj.str.contains is deprecated " - "and will raise in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - na = bool(na) - return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) def _str_replace( diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index da10a12d02ae4..563dce3008480 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1199,7 +1199,12 @@ def join(self, sep: str): @forbid_nonstring_types(["bytes"]) def contains( - self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na=lib.no_default, + regex: bool = True, ): r""" Test if pattern or regex is contained within a string of a Series or Index. @@ -1217,8 +1222,9 @@ def contains( Flags to pass through to the re module, e.g. re.IGNORECASE. na : scalar, optional Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. + array. For object-dtype, ``numpy.nan`` is used. For the nullable + ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype, + ``False`` is used. regex : bool, default True If True, assumes the pat is a regular expression. 
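On the Arrow backend, the _convert_bool_result changes above reduce to filling nulls in a boolean Arrow array; a sketch assuming pyarrow is installed:

    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.array(["om", None, "foo_nom"])
    res = pc.starts_with(arr, pattern="foo")   # [false, null, true]

    res.fill_null(False)   # default for the NaN-variant "str" dtype: null -> False
    res.fill_null(True)    # an explicit na=True keeps working as before
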
@@ -1336,7 +1342,7 @@ def contains( return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def match(self, pat: str, case: bool = True, flags: int = 0, na=None): + def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string starts with a match of a regular expression. @@ -1350,8 +1356,9 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=None): Regex module flags, e.g. re.IGNORECASE. na : scalar, optional Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. + array. For object-dtype, ``numpy.nan`` is used. For the nullable + ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype, + ``False`` is used. Returns ------- @@ -1377,7 +1384,7 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=None): return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): + def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string entirely matches a regular expression. @@ -1391,8 +1398,9 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): Regex module flags, e.g. re.IGNORECASE. na : scalar, optional Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. + array. For object-dtype, ``numpy.nan`` is used. For the nullable + ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype, + ``False`` is used. Returns ------- @@ -2415,7 +2423,7 @@ def count(self, pat, flags: int = 0): @forbid_nonstring_types(["bytes"]) def startswith( - self, pat: str | tuple[str, ...], na: Scalar | None = None + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default ) -> Series | Index: """ Test if the start of each string element matches a pattern. @@ -2427,10 +2435,11 @@ def startswith( pat : str or tuple[str, ...] Character sequence or tuple of strings. Regular expressions are not accepted. - na : object, default NaN + na : scalar, optional Object shown if element tested is not a string. The default depends on dtype of the array. For object-dtype, ``numpy.nan`` is used. - For ``StringDtype``, ``pandas.NA`` is used. + For the nullable ``StringDtype``, ``pandas.NA`` is used. + For the ``"str"`` dtype, ``False`` is used. Returns ------- @@ -2485,7 +2494,7 @@ def startswith( @forbid_nonstring_types(["bytes"]) def endswith( - self, pat: str | tuple[str, ...], na: Scalar | None = None + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default ) -> Series | Index: """ Test if the end of each string element matches a pattern. @@ -2497,10 +2506,11 @@ def endswith( pat : str or tuple[str, ...] Character sequence or tuple of strings. Regular expressions are not accepted. - na : object, default NaN + na : scalar, optional Object shown if element tested is not a string. The default depends on dtype of the array. For object-dtype, ``numpy.nan`` is used. - For ``StringDtype``, ``pandas.NA`` is used. + For the nullable ``StringDtype``, ``pandas.NA`` is used. + For the ``"str"`` dtype, ``False`` is used. 
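Condensing the docstring updates above: the default result for missing values now depends on the dtype. A sketch of the three behaviours, assuming pandas >= 2.3 semantics where the "str" alias is available:

    import pandas as pd

    data = ["apple", None, "banana"]

    pd.Series(data, dtype=object).str.contains("an")
    # [False, NaN, True]     object dtype: missing stays NaN, object result

    pd.Series(data, dtype="string").str.contains("an")
    # [False, <NA>, True]    nullable StringDtype: "boolean" result, NA propagates

    pd.Series(data, dtype="str").str.contains("an")
    # [False, False, True]   future default "str" dtype: plain bool, NaN -> False
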
Returns ------- diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 96b0352666b41..316c86d152db3 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -7,7 +7,7 @@ Literal, ) -import numpy as np +from pandas._libs import lib if TYPE_CHECKING: from collections.abc import Sequence @@ -85,7 +85,11 @@ def _str_repeat(self, repeats: int | Sequence[int]): @abc.abstractmethod def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar = np.nan + self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, ): pass @@ -95,7 +99,7 @@ def _str_fullmatch( pat: str | re.Pattern, case: bool = True, flags: int = 0, - na: Scalar = np.nan, + na: Scalar | lib.NoDefault = lib.no_default, ): pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index f376c239a0ce0..e82c6c20e86d9 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -44,7 +44,11 @@ def __len__(self) -> int: raise NotImplementedError def _str_map( - self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True + self, + f, + na_value=lib.no_default, + dtype: NpDtype | None = None, + convert: bool = True, ): """ Map a callable over valid elements of the array. @@ -65,7 +69,7 @@ def _str_map( """ if dtype is None: dtype = np.dtype("object") - if na_value is None: + if na_value is lib.no_default: na_value = self.dtype.na_value # type: ignore[attr-defined] if not len(self): @@ -127,7 +131,12 @@ def _str_pad( return self._str_map(f) def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na=lib.no_default, + regex: bool = True, ): if regex: if not case: @@ -142,7 +151,7 @@ def _str_contains( else: upper_pat = pat.upper() f = lambda x: upper_pat in x.upper() - if not isna(na) and not isinstance(na, bool): + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): # GH#59561 warnings.warn( "Allowing a non-bool 'na' in obj.str.contains is deprecated " @@ -152,9 +161,9 @@ def _str_contains( ) return self._str_map(f, na, dtype=np.dtype("bool")) - def _str_startswith(self, pat, na=None): + def _str_startswith(self, pat, na=lib.no_default): f = lambda x: x.startswith(pat) - if not isna(na) and not isinstance(na, bool): + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): # GH#59561 warnings.warn( "Allowing a non-bool 'na' in obj.str.startswith is deprecated " @@ -164,9 +173,9 @@ def _str_startswith(self, pat, na=None): ) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) - def _str_endswith(self, pat, na=None): + def _str_endswith(self, pat, na=lib.no_default): f = lambda x: x.endswith(pat) - if not isna(na) and not isinstance(na, bool): + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): # GH#59561 warnings.warn( "Allowing a non-bool 'na' in obj.str.endswith is deprecated " @@ -235,7 +244,11 @@ def rep(x, r): return result def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, ): if not case: flags |= re.IGNORECASE @@ -250,7 +263,7 @@ def _str_fullmatch( pat: str | re.Pattern, case: bool = True, flags: int = 0, - na: Scalar | None = None, + na: Scalar | lib.NoDefault = lib.no_default, ): if not case: flags |= re.IGNORECASE diff --git 
a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index 31e005466af7b..8987fc36656c5 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -111,6 +111,7 @@ def test_api_per_method( any_allowed_skipna_inferred_dtype, any_string_method, request, + using_infer_string, ): # this test does not check correctness of the different methods, # just that the methods work on the specified (inferred) dtypes, @@ -149,6 +150,10 @@ def test_api_per_method( t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) + if using_infer_string and dtype == "category": + string_allowed = method_name not in ["decode"] + else: + string_allowed = True bytes_allowed = method_name in ["decode", "get", "len", "slice"] # as of v0.23.4, all methods except 'cat' are very lenient with the # allowed data types, just returning NaN for entries that error. @@ -157,7 +162,8 @@ def test_api_per_method( mixed_allowed = method_name not in ["cat"] allowed_types = ( - ["string", "unicode", "empty"] + ["empty"] + + ["string", "unicode"] * string_allowed + ["bytes"] * bytes_allowed + ["mixed", "mixed-integer"] * mixed_allowed ) @@ -171,6 +177,7 @@ def test_api_per_method( msg = ( f"Cannot use .str.{method_name} with values of " f"inferred dtype {repr(inferred_dtype)}." + "|a bytes-like object is required, not 'str'" ) with pytest.raises(TypeError, match=msg): method(*args, **kwargs) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 2742c5b67e57e..48159c07de6ab 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -29,20 +29,28 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series( - np.array([False, np.nan, True, True, False], dtype=np.object_), - dtype=expected_dtype, - ) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([False, False, True, True, False], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series( + np.array([False, np.nan, True, True, False], dtype=np.object_), + dtype=expected_dtype, + ) + tm.assert_series_equal(result, expected) result = values.str.contains(pat, regex=False) - expected = Series( - np.array([False, np.nan, False, False, True], dtype=np.object_), - dtype=expected_dtype, - ) + if any_string_dtype == "str": + expected = Series([False, False, False, False, True], dtype=bool) + else: + expected = Series( + np.array([False, np.nan, False, False, True], dtype=np.object_), + dtype=expected_dtype, + ) tm.assert_series_equal(result, expected) values = Series( @@ -79,12 +87,16 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series( - np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype - ) + if any_string_dtype == "str": + expected = Series([False, False, True, True], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series( + np.array([False, np.nan, True, True], dtype=np.object_), + dtype=expected_dtype, + ) tm.assert_series_equal(result, expected) result = 
values.str.contains(pat, na=False) @@ -184,39 +196,45 @@ def test_contains_moar(any_string_dtype): ) result = s.str.contains("a") - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + na_value = False + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + na_value = np.nan expected = Series( - [False, False, False, True, True, False, np.nan, False, False, True], + [False, False, False, True, True, False, na_value, False, False, True], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("a", case=False) expected = Series( - [True, False, False, True, True, False, np.nan, True, False, True], + [True, False, False, True, True, False, na_value, True, False, True], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("Aa") expected = Series( - [False, False, False, True, False, False, np.nan, False, False, False], + [False, False, False, True, False, False, na_value, False, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("ba") expected = Series( - [False, False, False, True, False, False, np.nan, False, False, False], + [False, False, False, True, False, False, na_value, False, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("ba", case=False) expected = Series( - [False, False, False, True, True, False, np.nan, True, False, False], + [False, False, False, True, True, False, na_value, True, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) @@ -261,10 +279,14 @@ def test_contains_nan(any_string_dtype): tm.assert_series_equal(result, expected) result = s.str.contains("foo") - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([False, False, False], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -287,9 +309,7 @@ def test_startswith_endswith_validate_na(request, any_string_dtype): ) dtype = ser.dtype - if ( - isinstance(dtype, pd.StringDtype) and dtype.storage == "python" - ) or dtype == np.dtype("object"): + if (isinstance(dtype, pd.StringDtype)) or dtype == np.dtype("object"): msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): ser.str.startswith("kapow", na="baz") @@ -307,11 +327,12 @@ def test_startswith_endswith_validate_na(request, any_string_dtype): ser.str.endswith("kapow", na="baz") +@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) @pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) -def test_startswith(pat, dtype, null_value, na): +def test_startswith(pat, dtype, null_value, na, using_infer_string): # add category dtype parametrizations for GH-36241 values = Series( ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], @@ 
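The validate_na test above exercises the widened deprecation; a sketch of what user code sees now that non-bool na fills warn on every backend:

    import pandas as pd

    ser = pd.Series(["om", None, "foo_nom"], dtype=object)

    # FutureWarning: Allowing a non-bool 'na' in obj.str.startswith is deprecated
    # and will raise in a future version.
    ser.str.startswith("foo", na="baz")

    ser.str.startswith("foo", na=False)   # the supported spelling
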
-325,6 +346,8 @@ def test_startswith(pat, dtype, null_value, na): exp = exp.fillna(null_value) elif dtype == "object" and null_value is None: exp[exp.isna()] = None + elif using_infer_string and dtype == "category": + exp = exp.fillna(False).astype(bool) tm.assert_series_equal(result, exp) result = values.str.startswith(pat, na=na) @@ -342,20 +365,31 @@ def test_startswith(pat, dtype, null_value, na): @pytest.mark.parametrize("na", [None, True, False]) -def test_startswith_nullable_string_dtype(nullable_string_dtype, na): +def test_startswith_string_dtype(any_string_dtype, na): values = Series( ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], - dtype=nullable_string_dtype, + dtype=any_string_dtype, ) result = values.str.startswith("foo", na=na) + + expected_dtype = ( + (object if na is None else bool) + if is_object_or_nan_string_dtype(any_string_dtype) + else "boolean" + ) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + if na is None: + na = False exp = Series( - [False, na, True, False, False, na, True, False, False], dtype="boolean" + [False, na, True, False, False, na, True, False, False], dtype=expected_dtype ) tm.assert_series_equal(result, exp) result = values.str.startswith("rege.", na=na) exp = Series( - [False, na, False, False, False, na, False, False, True], dtype="boolean" + [False, na, False, False, False, na, False, False, True], dtype=expected_dtype ) tm.assert_series_equal(result, exp) @@ -365,11 +399,12 @@ def test_startswith_nullable_string_dtype(nullable_string_dtype, na): # -------------------------------------------------------------------------------------- +@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) @pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) -def test_endswith(pat, dtype, null_value, na): +def test_endswith(pat, dtype, null_value, na, using_infer_string): # add category dtype parametrizations for GH-36241 values = Series( ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], @@ -383,6 +418,8 @@ def test_endswith(pat, dtype, null_value, na): exp = exp.fillna(null_value) elif dtype == "object" and null_value is None: exp[exp.isna()] = None + elif using_infer_string and dtype == "category": + exp = exp.fillna(False).astype(bool) tm.assert_series_equal(result, exp) result = values.str.endswith(pat, na=na) @@ -400,20 +437,30 @@ def test_endswith(pat, dtype, null_value, na): @pytest.mark.parametrize("na", [None, True, False]) -def test_endswith_nullable_string_dtype(nullable_string_dtype, na): +def test_endswith_string_dtype(any_string_dtype, na): values = Series( ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], - dtype=nullable_string_dtype, + dtype=any_string_dtype, ) result = values.str.endswith("foo", na=na) + expected_dtype = ( + (object if na is None else bool) + if is_object_or_nan_string_dtype(any_string_dtype) + else "boolean" + ) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + if na is None: + na = False exp = Series( - [False, na, False, False, True, na, True, False, False], dtype="boolean" + [False, na, False, False, True, na, True, False, False], dtype=expected_dtype ) tm.assert_series_equal(result, exp) result = values.str.endswith("rege.", na=na) exp = Series( - [False, na, False, False, False, na, 
False, False, True], dtype="boolean" + [False, na, False, False, False, na, False, False, True], dtype=expected_dtype ) tm.assert_series_equal(result, exp) @@ -692,36 +739,41 @@ def test_replace_regex_single_character(regex, any_string_dtype): def test_match(any_string_dtype): - # New match behavior introduced in 0.13 - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + na_value = False + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + na_value = np.nan values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) result = values.str.match(".*(BAD[_]+).*(BAD)") - expected = Series([True, np.nan, False], dtype=expected_dtype) + expected = Series([True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) values = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = values.str.match(".*BAD[_]+.*BAD") - expected = Series([True, True, np.nan, False], dtype=expected_dtype) + expected = Series([True, True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = values.str.match("BAD[_]+.*BAD") - expected = Series([False, True, np.nan, False], dtype=expected_dtype) + expected = Series([False, True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) values = Series( ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = values.str.match("^BAD[_]+.*BAD") - expected = Series([False, False, np.nan, False], dtype=expected_dtype) + expected = Series([False, False, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = values.str.match("\\^BAD[_]+.*BAD") - expected = Series([False, True, np.nan, False], dtype=expected_dtype) + expected = Series([False, True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -757,10 +809,17 @@ def test_match_na_kwarg(any_string_dtype): tm.assert_series_equal(result, expected) result = s.str.match("a") - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series([True, False, np.nan], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + na_value = False + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + na_value = np.nan + + expected = Series([True, False, na_value], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -785,10 +844,14 @@ def test_fullmatch(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD") - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series([True, False, np.nan, False], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([True, False, False, False], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([True, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -796,10 +859,14 @@ def test_fullmatch_dollar_literal(any_string_dtype): # GH 56652 ser = Series(["foo", "foo$foo", np.nan, "foo$"], 
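The match/fullmatch tests above assert the same propagation rule; a sketch under the "str" dtype (available as an alias in the 2.3 development branch this patch targets):

    import numpy as np
    import pandas as pd

    values = pd.Series(["fooBAD__barBAD", np.nan, "foo"], dtype="str")

    values.str.match(".*BAD[_]+.*BAD")
    # 0     True
    # 1    False   <- the missing value propagates as False; result dtype is bool
    # 2    False
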
dtype=any_string_dtype) result = ser.str.fullmatch("foo\\$") - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series([False, False, np.nan, True], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([False, False, False, True], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([False, False, np.nan, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index 517ddb164985c..cd3c512328139 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -38,7 +38,7 @@ def test_string_array(nullable_string_dtype, any_string_method): expected.values, skipna=True ): assert result.dtype == "boolean" - result = result.astype(object) + expected = expected.astype("boolean") elif expected.dtype == "bool": assert result.dtype == "boolean" diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 40b6c69dc8025..7c396e65b6120 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -217,8 +217,21 @@ def test_ismethods(method, expected, any_string_dtype): tm.assert_series_equal(result, expected) # compare with standard library - expected = [getattr(item, method)() for item in ser] - assert list(result) == expected + expected_stdlib = [getattr(item, method)() for item in ser] + assert list(result) == expected_stdlib + + # with missing value + ser.iloc[[1, 2, 3, 4]] = np.nan + result = getattr(ser.str, method)() + if ser.dtype == "object": + expected = expected.astype(object) + expected.iloc[[1, 2, 3, 4]] = np.nan + elif ser.dtype == "str": + # NaN propagates as False + expected.iloc[[1, 2, 3, 4]] = False + else: + # nullable dtypes propagate NaN + expected.iloc[[1, 2, 3, 4]] = np.nan @pytest.mark.parametrize( @@ -248,6 +261,7 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): assert list(result) == expected +@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize( "method, expected", [ @@ -258,10 +272,14 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): def test_isnumeric_unicode_missing(method, expected, any_string_dtype): values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] # noqa: RUF001 ser = Series(values, dtype=any_string_dtype) - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series(expected, dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series(expected, dtype=object).fillna(False).astype(bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) From 84a37f75d341f886bc1deb26e85a14f3882fd9d5 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 27 Oct 2024 06:36:48 -0700 Subject: [PATCH 286/396] Backport PR #60109 on branch 2.3.x (CI/TST: Update pyreadstat tests and pin xarray on CI) (#60110) Backport PR #60109: CI/TST: Update pyreadstat tests and pin xarray on CI Co-authored-by: Richard Shadrach 
<45562402+rhshadrach@users.noreply.github.com> --- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-312.yaml | 2 +- ci/deps/circle-310-arm64.yaml | 2 +- environment.yml | 2 +- pandas/tests/io/test_spss.py | 4 +++- requirements-dev.txt | 2 +- 8 files changed, 10 insertions(+), 8 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index d0e788d1b124f..7cb2d8171c0cb 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -52,7 +52,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 7fda383dd9e1d..d8c3e6e220630 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -54,7 +54,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index c72d743bf3375..9a0cb5ab81d23 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -52,7 +52,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 032bd68c09ad6..bf79d14cd1b78 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -52,7 +52,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index 36c584bf1fd10..5433d00bb94b5 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -53,7 +53,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <2024.10.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/environment.yml b/environment.yml index 58eb69ad1f070..8987623bd865e 100644 --- a/environment.yml +++ b/environment.yml @@ -54,7 +54,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index e118c90d9bc02..82613b4e80725 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -161,4 +161,6 @@ def test_spss_metadata(datapath): "modification_time": datetime.datetime(2015, 2, 6, 14, 33, 36), } ) - assert df.attrs == metadata + if Version(pyreadstat.__version__) >= Version("1.2.8"): + metadata["mr_sets"] = {} + tm.assert_dict_equal(df.attrs, metadata) diff --git a/requirements-dev.txt b/requirements-dev.txt index 5a63e59e1db88..712b5e01257ff 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -43,7 +43,7 @@ s3fs>=2022.11.0 scipy>=1.10.0 SQLAlchemy>=2.0.0 tabulate>=0.9.0 -xarray>=2022.12.0 +xarray>=2022.12.0, <=2024.9.0 xlrd>=2.0.1 xlsxwriter>=3.0.5 zstandard>=0.19.0 From a298795487ac825888e564505307f42d2b05ef9d Mon Sep 17 00:00:00 2001 From: Richard Shadrach 
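The test_spss change above gates an expectation on the installed pyreadstat version; a sketch of the version-gate idiom, with the import path assumed from pandas' vendored packaging helpers:

    import datetime

    import pyreadstat
    from pandas.util.version import Version

    metadata = {"modification_time": datetime.datetime(2015, 2, 6, 14, 33, 36)}
    if Version(pyreadstat.__version__) >= Version("1.2.8"):
        # newer pyreadstat also reports multiple-response sets in the metadata
        metadata["mr_sets"] = {}
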
<45562402+rhshadrach@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:38:32 -0400 Subject: [PATCH 287/396] DEPR: Update groupby.apply DeprecationWarning to FutureWarning (#59751) * DEPR: Update groupby.apply DeprecationWarning to FutureWarning * Remove xfail * Add whatsnew note --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/groupby/groupby.py | 2 +- pandas/core/resample.py | 2 +- pandas/tests/extension/base/groupby.py | 4 +- pandas/tests/frame/test_stack_unstack.py | 2 +- pandas/tests/groupby/aggregate/test_other.py | 4 +- .../groupby/methods/test_value_counts.py | 2 +- pandas/tests/groupby/test_apply.py | 126 +++++++++--------- pandas/tests/groupby/test_apply_mutate.py | 14 +- pandas/tests/groupby/test_categorical.py | 6 +- pandas/tests/groupby/test_counting.py | 2 +- pandas/tests/groupby/test_groupby.py | 32 ++--- pandas/tests/groupby/test_groupby_dropna.py | 2 +- pandas/tests/groupby/test_groupby_subclass.py | 4 +- pandas/tests/groupby/test_grouping.py | 2 +- pandas/tests/groupby/test_timegrouper.py | 10 +- .../tests/groupby/transform/test_transform.py | 4 +- pandas/tests/resample/test_datetime_index.py | 10 +- pandas/tests/resample/test_resample_api.py | 2 +- .../tests/resample/test_resampler_grouper.py | 39 +++--- pandas/tests/resample/test_time_grouper.py | 2 +- pandas/tests/window/test_groupby.py | 32 ++--- 22 files changed, 151 insertions(+), 154 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 01c2ed3821d7a..00503766b062f 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -54,7 +54,7 @@ notable_bug_fix1 Deprecations ~~~~~~~~~~~~ - Deprecated allowing non-``bool`` values for ``na`` in :meth:`.str.contains`, :meth:`.str.startswith`, and :meth:`.str.endswith` for dtypes that do not already disallow these (:issue:`59615`) -- +- The deprecation of setting the argument ``include_groups`` to ``True`` in :meth:`DataFrameGroupBy.apply` has been promoted from a ``DeprecationWarning`` to ``FutureWarning``; only ``False`` will be allowed (:issue:`7155`) .. --------------------------------------------------------------------------- .. 
_whatsnew_230.performance: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index db8949788567b..296a601288f9d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1831,7 +1831,7 @@ def f(g): message=_apply_groupings_depr.format( type(self).__name__, "apply" ), - category=DeprecationWarning, + category=FutureWarning, stacklevel=find_stack_level(), ) except TypeError: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 0dd808a0ab296..229595202cccb 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2913,7 +2913,7 @@ def _apply( new_message = _apply_groupings_depr.format("DataFrameGroupBy", "resample") with rewrite_warning( target_message=target_message, - target_category=DeprecationWarning, + target_category=FutureWarning, new_message=new_message, ): result = grouped.apply(how, *args, include_groups=include_groups, **kwargs) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 414683b02dcba..6947e672f3d44 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -114,11 +114,11 @@ def test_groupby_extension_transform(self, data_for_grouping): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op) df.groupby("B", group_keys=False, observed=False).A.apply(groupby_apply_op) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op) df.groupby("A", group_keys=False, observed=False).B.apply(groupby_apply_op) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 2c3e9c1d5e327..8bb5eb2d5c57a 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1832,7 +1832,7 @@ def test_unstack_bug(self, future_stack): ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 35ee6c388d5a8..5904b2f48359e 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -505,7 +505,7 @@ def test_agg_timezone_round_trip(): # GH#27110 applying iloc should return a DataFrame msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] ts = df["B"].iloc[2] @@ -513,7 +513,7 @@ def test_agg_timezone_round_trip(): # GH#27110 applying iloc should return a DataFrame msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with 
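User-facing effect of the promotion above: the warning is now loud by default, and passing include_groups=False both opts in to the future behaviour and silences the warning (a sketch, assuming pandas >= 2.2 where the keyword exists):

    import pandas as pd

    df = pd.DataFrame({"key": [1, 1, 2], "value": [10, 20, 30]})

    # FutureWarning: DataFrameGroupBy.apply operated on the grouping columns ...
    df.groupby("key").apply(lambda g: g.sum())

    # future behaviour, no warning: the grouping column is excluded from `g`
    df.groupby("key").apply(lambda g: g.sum(), include_groups=False)
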
tm.assert_produces_warning(FutureWarning, match=msg): assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index d8c6c7c3fe50c..476ce1fe1b8cc 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -337,7 +337,7 @@ def test_against_frame_and_seriesgroupby( ) if frame: # compare against apply with DataFrame value_counts - warn = DeprecationWarning if groupby == "column" else None + warn = FutureWarning if groupby == "column" else None msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(warn, match=msg): expected = gp.apply( diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index cc736f2bf53ba..8ee38a688a1a0 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -30,7 +30,7 @@ def store(group): groups.append(group) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): df.groupby("index").apply(store) expected_value = DataFrame( {"index": [0] * 10, 0: [1] * 10}, index=pd.RangeIndex(0, 100, 10) @@ -114,7 +114,7 @@ def test_apply_index_date_object(): exp_idx = Index(["2011-05-16", "2011-05-17", "2011-05-18"], name="date") expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("date", group_keys=False).apply( lambda x: x["time"][x["value"].idxmax()] ) @@ -226,7 +226,7 @@ def f_constant_df(group): del names[:] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): df.groupby("a", group_keys=False).apply(func) assert names == group_names @@ -246,7 +246,7 @@ def test_group_apply_once_per_group2(capsys): ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): df.groupby("group_by_column", group_keys=False).apply( lambda df: print("function_called") ) @@ -270,9 +270,9 @@ def fast(group): return group.copy() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): fast_df = df.groupby("A", group_keys=False).apply(fast) - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): slow_df = df.groupby("A", group_keys=False).apply(slow) tm.assert_frame_equal(fast_df, slow_df) @@ -296,7 +296,7 @@ def test_groupby_apply_identity_maybecopy_index_identical(func): df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("g", group_keys=False).apply(func) tm.assert_frame_equal(result, df) @@ -341,9 +341,9 @@ def test_groupby_as_index_apply(): tm.assert_index_equal(res_not_as, 
exp) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): res_as_apply = g_as.apply(lambda x: x.head(2)).index - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index # apply doesn't maintain the original ordering @@ -358,7 +358,7 @@ def test_groupby_as_index_apply(): ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index tm.assert_index_equal(res, ind) @@ -389,17 +389,17 @@ def desc3(group): return result msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = grouped.apply(desc) assert result.index.names == ("A", "B", "stat") msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result2 = grouped.apply(desc2) assert result2.index.names == ("A", "B", "stat") msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result3 = grouped.apply(desc3) assert result3.index.names == ("A", "B", None) @@ -431,7 +431,7 @@ def test_apply_series_yield_constant(df): def test_apply_frame_yield_constant(df): # GH13568 msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby(["A", "B"]).apply(len) assert isinstance(result, Series) assert result.name is None @@ -444,7 +444,7 @@ def test_apply_frame_yield_constant(df): def test_apply_frame_to_series(df): grouped = df.groupby(["A", "B"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = grouped.apply(len) expected = grouped.count()["C"] tm.assert_index_equal(result.index, expected.index) @@ -455,7 +455,7 @@ def test_apply_frame_not_as_index_column_name(df): # GH 35964 - path within _wrap_applied_output not hit by a test grouped = df.groupby(["A", "B"], as_index=False) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = grouped.apply(len) expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D") # TODO(GH#34306): Use assert_frame_equal when column name is not np.nan @@ -480,7 +480,7 @@ def trans2(group): ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A").apply(trans) exp = df.groupby("A")["C"].apply(trans2) tm.assert_series_equal(result, exp, check_names=False) 
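The mechanical churn above and below follows pandas' standard escalation: a deprecation starts as a DeprecationWarning (hidden by default outside test runners) and becomes a FutureWarning one step before enforcement. A generic sketch of the pattern with a hypothetical function, not pandas internals:

    import warnings

    def apply(func, include_groups=True):
        if include_groups:
            warnings.warn(
                "DataFrameGroupBy.apply operated on the grouping columns. "
                "This behavior is deprecated; select the columns to operate "
                "on after groupby to silence this warning.",
                FutureWarning,   # was DeprecationWarning in the 2.2.x series
                stacklevel=2,
            )
        # ... perform the actual work here ...
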
@@ -511,7 +511,7 @@ def test_apply_chunk_view(group_keys): df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) expected = df.take([0, 1, 3, 4, 6, 7]) if group_keys: @@ -534,7 +534,7 @@ def test_apply_no_name_column_conflict(): # it works! #2605 grouped = df.groupby(["name", "name2"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): grouped.apply(lambda x: x.sort_values("value", inplace=True)) @@ -553,7 +553,7 @@ def f(group): return group msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() @@ -579,7 +579,7 @@ def f(group): return group msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() @@ -619,9 +619,9 @@ def filt2(x): return x[x.category == "c"] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = data.groupby("id_field").apply(filt1) - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = data.groupby("id_field").apply(filt2) tm.assert_frame_equal(result, expected) @@ -642,7 +642,7 @@ def test_apply_with_duplicated_non_sorted_axis(test_series): tm.assert_series_equal(result, expected) else: msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("Y", group_keys=False).apply(lambda x: x) # not expecting the order to remain the same for duplicated axis @@ -689,7 +689,7 @@ def f(g): return g msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = grouped.apply(f) assert "value3" in result @@ -705,11 +705,11 @@ def test_apply_numeric_coercion_when_datetime(): {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) df.Date = pd.to_datetime(df.Date) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) tm.assert_series_equal(result["Str"], expected["Str"]) @@ -722,7 +722,7 @@ def get_B(g): return g.iloc[0][["B"]] msg = "DataFrameGroupBy.apply operated on the grouping columns" - 
with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A").apply(get_B)["B"] expected = df.B expected.index = df.A @@ -749,9 +749,9 @@ def predictions(tool): df2 = df1.copy() df2.oTime = pd.to_datetime(df2.oTime) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = df1.groupby("Key").apply(predictions).p1 - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df2.groupby("Key").apply(predictions).p1 tm.assert_series_equal(expected, result) @@ -768,7 +768,7 @@ def test_apply_aggregating_timedelta_and_datetime(): ) df["time_delta_zero"] = df.datetime - df.datetime msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("clientid").apply( lambda ddf: Series( {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} @@ -817,13 +817,13 @@ def func_with_date(batch): return Series({"b": datetime(2015, 1, 1), "c": 2}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1]) dfg_no_conversion_expected.index.name = "a" msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = DataFrame( {"b": pd.Timestamp(2015, 1, 1).as_unit("ns"), "c": 2}, index=[1] @@ -869,7 +869,7 @@ def test_func(x): pass msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = test_df.groupby("groups").apply(test_func) expected = DataFrame() tm.assert_frame_equal(result, expected) @@ -886,9 +886,9 @@ def test_func(x): return x.iloc[[0, -1]] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result1 = test_df1.groupby("groups").apply(test_func) - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result2 = test_df2.groupby("groups").apply(test_func) index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None]) index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None]) @@ -903,7 +903,7 @@ def test_groupby_apply_return_empty_chunk(): df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]}) groups = df.groupby("group") msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = groups.apply(lambda group: group[group.value != 1]["value"]) expected = Series( [0], @@ -932,7 +932,7 @@ def test_func_returns_object(): # GH 28652 df = DataFrame({"a": [1, 2]}, 
index=Index([1, 2])) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("a").apply(lambda g: g.index) expected = Series([Index([1]), Index([2])], index=Index([1, 2], name="a")) @@ -951,7 +951,7 @@ def test_apply_datetime_issue(group_column_dtlike): df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) expected = DataFrame(["spam"], Index(["foo"], dtype="str", name="a"), columns=[42]) @@ -990,7 +990,7 @@ def most_common_values(df): return Series({c: s.value_counts().index[0] for c, s in df.items()}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = tdf.groupby("day").apply(most_common_values)["userId"] expected = Series( ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" @@ -1033,7 +1033,7 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): columns=["observation", "color", "mood", "intensity", "score"], ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes dtype = pd.StringDtype(na_value=np.nan) if using_infer_string else object expected = Series( @@ -1056,7 +1056,7 @@ def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("group", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) @@ -1081,7 +1081,7 @@ def test_apply_function_returns_non_pandas_non_scalar(function, expected_values) # GH 31441 df = DataFrame(["A", "A", "B", "B"], columns=["groups"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("groups").apply(function) expected = Series(expected_values, index=Index(["A", "B"], name="groups")) tm.assert_series_equal(result, expected) @@ -1095,7 +1095,7 @@ def fct(group): df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A").apply(fct) expected = Series( [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A") @@ -1108,7 +1108,7 @@ def test_apply_function_index_return(function): # GH: 22541 df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = 
df.groupby("id").apply(function) expected = Series( [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])], @@ -1146,7 +1146,7 @@ def test_apply_result_type(group_keys, udf): # regardless of whether the UDF happens to be a transform. df = DataFrame({"A": ["a", "b"], "B": [1, 2]}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): df_result = df.groupby("A", group_keys=group_keys).apply(udf) series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf) @@ -1163,9 +1163,9 @@ def test_result_order_group_keys_false(): # apply result order should not depend on whether index is the same or just equal df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A", group_keys=False).apply(lambda x: x) - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) tm.assert_frame_equal(result, expected) @@ -1179,11 +1179,11 @@ def test_apply_with_timezones_aware(): df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result1 = df1.groupby("x", group_keys=False).apply( lambda df: df[["x", "y"]].copy() ) - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result2 = df2.groupby("x", group_keys=False).apply( lambda df: df[["x", "y"]].copy() ) @@ -1242,7 +1242,7 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): grp = df.groupby(["A", "B"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = grp.apply(lambda x: x.head(1)) expected = df.iloc[[0, 2, 3]] @@ -1292,7 +1292,7 @@ def test_apply_dropna_with_indexed_same(dropna): index=list("xxyxz"), ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) expected = df.dropna() if dropna else df.iloc[[0, 3, 1, 2, 4]] tm.assert_frame_equal(result, expected) @@ -1320,7 +1320,7 @@ def test_apply_as_index_constant_lambda(as_index, expected): # GH 13217 df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) tm.assert_equal(result, expected) @@ -1332,7 +1332,7 @@ def test_sort_index_groups(): index=range(5), ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("C").apply(lambda x: 
x.A.sort_index()) expected = Series( range(1, 6), @@ -1354,7 +1354,7 @@ def test_positional_slice_groups_datetimelike(): } ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = expected.groupby( [expected.let, expected.date.dt.date], group_keys=False ).apply(lambda x: x.iloc[0:]) @@ -1401,9 +1401,9 @@ def test_apply_na(dropna): ) dfgrp = df.groupby("grp", dropna=dropna) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) tm.assert_frame_equal(result, expected) @@ -1411,7 +1411,7 @@ def test_apply_na(dropna): def test_apply_empty_string_nan_coerce_bug(): # GH#24903 msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = ( DataFrame( { @@ -1448,7 +1448,7 @@ def test_apply_index_key_error_bug(index_values): index=Index(["a2", "a3", "aa"], name="a"), ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = result.groupby("a").apply( lambda df: Series([df["b"].mean()], index=["b_mean"]) ) @@ -1500,7 +1500,7 @@ def test_apply_nonmonotonic_float_index(arg, idx): # GH 34455 expected = DataFrame({"col": arg}, index=idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = expected.groupby("col", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, expected) @@ -1553,7 +1553,7 @@ def test_include_groups(include_groups): # GH#7155 df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) gb = df.groupby("a") - warn = DeprecationWarning if include_groups else None + warn = FutureWarning if include_groups else None msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(warn, match=msg): result = gb.apply(lambda x: x.sum(), include_groups=include_groups) @@ -1589,11 +1589,11 @@ def test_builtins_apply(keys, f): npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = gb.apply(npfunc) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected2 = gb.apply(lambda x: npfunc(x)) tm.assert_frame_equal(result, expected2) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py index cfd1a4bca9d91..130a29abf9443 100644 --- a/pandas/tests/groupby/test_apply_mutate.py +++ b/pandas/tests/groupby/test_apply_mutate.py @@ -14,12 +14,12 @@ def test_group_by_copy(): ).set_index("name") msg = "DataFrameGroupBy.apply operated on 
the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): grp_by_same_value = df.groupby(["age"], group_keys=False).apply( lambda group: group ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): grp_by_copy = df.groupby(["age"], group_keys=False).apply( lambda group: group.copy() ) @@ -54,9 +54,9 @@ def f_no_copy(x): return x.groupby("cat2")["rank"].min() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): grpby_copy = df.groupby("cat1").apply(f_copy) - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): grpby_no_copy = df.groupby("cat1").apply(f_no_copy) tm.assert_series_equal(grpby_copy, grpby_no_copy) @@ -68,9 +68,9 @@ def test_no_mutate_but_looks_like(): df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) tm.assert_series_equal(result1, result2) @@ -87,7 +87,7 @@ def fn(x): msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning( - DeprecationWarning, match=msg, raise_on_extra_warnings=not warn_copy_on_write + FutureWarning, match=msg, raise_on_extra_warnings=not warn_copy_on_write ): result = df.groupby(["col1"], as_index=False).apply(fn) expected = pd.Series( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index c70995de7b3b2..cded7a71458fa 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -127,7 +127,7 @@ def f(x): return x.drop_duplicates("person_name").iloc[0] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") @@ -335,7 +335,7 @@ def test_apply(ordered): idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = Series(1, index=idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = grouped.apply(lambda x: 1) tm.assert_series_equal(result, expected) @@ -2053,7 +2053,7 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde df["a2"] = df["a"] df = df.set_index(keys) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - warn = DeprecationWarning if method == "apply" and index_kind == "range" else None + warn = FutureWarning if method == "apply" and index_kind == "range" else None msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(warn, 
match=msg): op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 2622895f9f8d2..16d7fe61b90ad 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -290,7 +290,7 @@ def test_count(): for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) tm.assert_frame_equal(left, right) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 586ef8a126536..57e691b3c508d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -166,7 +166,7 @@ def max_value(group): return group.loc[group["value"].idxmax()] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): applied = df.groupby("A").apply(max_value) result = applied.dtypes expected = df.dtypes @@ -189,7 +189,7 @@ def f_0(grp): expected = df.groupby("A").first()[["B"]] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A").apply(f_0)[["B"]] tm.assert_frame_equal(result, expected) @@ -199,7 +199,7 @@ def f_1(grp): return grp.iloc[0] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A").apply(f_1)[["B"]] e = expected.copy() e.loc["Tiger"] = np.nan @@ -211,7 +211,7 @@ def f_2(grp): return grp.iloc[0] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A").apply(f_2)[["B"]] e = expected.copy() e.loc["Pony"] = np.nan @@ -224,7 +224,7 @@ def f_3(grp): return grp.iloc[0] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A").apply(f_3)[["C"]] e = df.groupby("A").first()[["C"]] e.loc["Pony"] = pd.NaT @@ -237,7 +237,7 @@ def f_4(grp): return grp.iloc[0].loc["C"] msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A").apply(f_4) e = df.groupby("A").first()["C"].copy() e.loc["Pony"] = np.nan @@ -424,9 +424,9 @@ def f3(x): # correct result msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result1 = df.groupby("a").apply(f1) - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result2 = df2.groupby("a").apply(f1) tm.assert_frame_equal(result1, result2) @@ -1379,13 +1379,13 @@ def summarize_random_name(df): return Series({"count": 1, 
"mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): metrics = df.groupby("A").apply(summarize) assert metrics.columns.name is None - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): metrics = df.groupby("A").apply(summarize, "metrics") assert metrics.columns.name == "metrics" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): metrics = df.groupby("A").apply(summarize_random_name) assert metrics.columns.name is None @@ -1681,7 +1681,7 @@ def test_dont_clobber_name_column(): ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("key", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) @@ -1769,7 +1769,7 @@ def freducex(x): # make sure all these work msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): grouped.apply(f) grouped.aggregate(freduce) grouped.aggregate({"C": freduce, "D": freduce}) @@ -1792,7 +1792,7 @@ def f(group): return group.copy() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): df.groupby("a", sort=False, group_keys=False).apply(f) expected_names = [0, 1, 2] @@ -2000,7 +2000,7 @@ def test_sort(x): tm.assert_frame_equal(x, x.sort_values(by=sort_column)) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): g.apply(test_sort) @@ -2187,7 +2187,7 @@ def test_empty_groupby_apply_nonunique_columns(): df.columns = [0, 1, 2, 0] gb = df.groupby(df[1], group_keys=False) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): res = gb.apply(lambda x: x) assert (res.dtypes == df.dtypes).all() diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 9c01e017dd29c..7e65e56abc4c9 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -328,7 +328,7 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, df = pd.DataFrame(data) gb = df.groupby("groups", dropna=dropna) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) mi_tuples = tuple(zip(data["groups"], selected_data["values"])) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 0832b67b38098..1a2acb658ee26 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -74,7 +74,7 @@ def func(group): msg = "DataFrameGroupBy.apply 
operated on the grouping columns" with tm.assert_produces_warning( - DeprecationWarning, + FutureWarning, match=msg, raise_on_extra_warnings=False, check_stacklevel=False, @@ -126,7 +126,7 @@ def test_groupby_resample_preserves_subclass(obj): # Confirm groupby.resample() preserves dataframe type msg = "DataFrameGroupBy.resample operated on the grouping columns" with tm.assert_produces_warning( - DeprecationWarning, + FutureWarning, match=msg, raise_on_extra_warnings=False, check_stacklevel=False, diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 6e3ae2f7d8fae..7c0a4b78a123d 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -240,7 +240,7 @@ def test_grouper_creation_bug(self): tm.assert_frame_equal(result, expected) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = g.apply(lambda x: x.sum()) expected["A"] = [0, 2, 4] expected = expected.loc[:, ["A", "B"]] diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 69542b934e65f..92dfe146bbb54 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -481,10 +481,10 @@ def sumfunc_series(x): return Series([x["value"].sum()], ("sum",)) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) tm.assert_frame_equal( result.reset_index(drop=True), expected.reset_index(drop=True) @@ -502,9 +502,9 @@ def sumfunc_value(x): return x.value.sum() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) tm.assert_series_equal( result.reset_index(drop=True), expected.reset_index(drop=True) @@ -932,7 +932,7 @@ def test_groupby_apply_timegrouper_with_nat_apply_squeeze( # function that returns a Series msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): res = gb.apply(lambda x: x["Quantity"] * 2) dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date") diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 395036dd400e5..690eb6f410798 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -683,7 +683,7 @@ def f(group): grouped = df.groupby("c") msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with 
tm.assert_produces_warning(FutureWarning, match=msg): result = grouped.apply(f) assert result["d"].dtype == np.float64 @@ -841,7 +841,7 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): if op != "shift" or not isinstance(gb_target.get("by"), (str, list)): warn = None else: - warn = DeprecationWarning + warn = FutureWarning msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(warn, match=msg): expected = gb.apply(targop) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index ddd81ab1d347d..80583f5d3c5f2 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1080,10 +1080,10 @@ def test_resample_segfault(unit): ).set_index("timestamp") df.index = df.index.as_unit(unit) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("ID").resample("5min").sum() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) tm.assert_frame_equal(result, expected) @@ -1104,7 +1104,7 @@ def test_resample_dtype_preservation(unit): assert result.val.dtype == np.int32 msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("group").resample("1D").ffill() assert result.val.dtype == np.int32 @@ -1881,10 +1881,10 @@ def f(data, add_arg): df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10)) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = df.groupby("A").resample("D").mean().multiply(multiplier) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 12abd1c98784b..af4cf5d4ebae5 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -78,7 +78,7 @@ def test_groupby_resample_api(): index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"]) expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 32567b4300152..e2d456fea2b23 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from 
pandas._config import using_string_dtype - from pandas.compat import is_platform_windows import pandas as pd @@ -72,10 +70,10 @@ def f_0(x): return x.set_index("date").resample("D").asfreq() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = df.groupby("id").apply(f_0) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.set_index("date").groupby("id").resample("D").asfreq() tm.assert_frame_equal(result, expected) @@ -91,10 +89,10 @@ def f_1(x): return x.resample("1D").ffill() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = df.groupby("group").apply(f_1) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("group").resample("1D").ffill() tm.assert_frame_equal(result, expected) @@ -111,7 +109,7 @@ def test_getitem(test_frame): tm.assert_series_equal(result, expected) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = g.resample("2s").mean().B tm.assert_series_equal(result, expected) @@ -237,10 +235,10 @@ def test_methods(f, test_frame): r = g.resample("2s") msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = getattr(r, f)() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) tm.assert_equal(result, expected) @@ -259,10 +257,10 @@ def test_methods_std_var(f, test_frame): g = test_frame.groupby("A") r = g.resample("2s") msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = getattr(r, f)(ddof=1) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) tm.assert_frame_equal(result, expected) @@ -273,14 +271,14 @@ def test_apply(test_frame): # reduction msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = g.resample("2s").sum() def f_0(x): return x.resample("2s").sum() msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = r.apply(f_0) tm.assert_frame_equal(result, expected) @@ -288,7 +286,7 @@ def f_1(x): return x.resample("2s").apply(lambda y: y.sum()) msg 
= "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = g.apply(f_1) # y.sum() results in int64 instead of int32 on 32-bit architectures expected = expected.astype("int64") @@ -358,7 +356,7 @@ def test_resample_groupby_with_label(unit): index = date_range("2000-01-01", freq="2D", periods=5, unit=unit) df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("col0").resample("1W", label="left").sum() mi = [ @@ -381,7 +379,7 @@ def test_consistency_with_window(test_frame): df = test_frame expected = Index([1, 2, 3], name="A") msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A").resample("2s").mean() assert result.index.nlevels == 2 tm.assert_index_equal(result.index.levels[0], expected) @@ -481,7 +479,7 @@ def test_empty(keys): # GH 26411 df = DataFrame([], columns=["a", "b"], index=TimedeltaIndex([])) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() expected = ( DataFrame(columns=["a", "b"]) @@ -494,7 +492,6 @@ def test_empty(keys): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("consolidate", [True, False]) def test_resample_groupby_agg_object_dtype_all_nan(consolidate): # https://github.com/pandas-dev/pandas/issues/39329 @@ -507,7 +504,7 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): df = df._consolidate() msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby(["key"]).resample("W", on="date").min() idx = pd.MultiIndex.from_arrays( [ @@ -559,7 +556,7 @@ def test_resample_no_index(keys): df["date"] = pd.to_datetime(df["date"]) df = df.set_index("date") msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() expected = DataFrame(columns=["a", "b", "date"]).set_index(keys, drop=False) expected["date"] = pd.to_datetime(expected["date"]) @@ -608,7 +605,7 @@ def test_groupby_resample_size_all_index_same(): index=date_range("31/12/2000 18:00", freq="h", periods=12), ) msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A").resample("D").size() mi_exp = pd.MultiIndex.from_arrays( diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 3f9340b800eae..3d9098917a12d 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ 
b/pandas/tests/resample/test_time_grouper.py @@ -346,7 +346,7 @@ def test_groupby_resample_interpolate(): df["week_starting"] = date_range("01/01/2018", periods=3, freq="W") msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = ( df.set_index("week_starting") .groupby("volume") diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 45e7e07affd75..400bf10817ab8 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -101,7 +101,7 @@ def test_rolling(self, f, roll_frame): result = getattr(r, f)() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = g.apply(lambda x: getattr(x.rolling(4), f)()) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) @@ -117,7 +117,7 @@ def test_rolling_ddof(self, f, roll_frame): result = getattr(r, f)(ddof=1) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) @@ -135,7 +135,7 @@ def test_rolling_quantile(self, interpolation, roll_frame): result = r.quantile(0.4, interpolation=interpolation) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = g.apply( lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) ) @@ -182,7 +182,7 @@ def func(x): return getattr(x.rolling(4), f)(roll_frame) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = g.apply(func) # GH 39591: The grouped column should be all np.nan # (groupby.apply inserts 0s for cov) @@ -200,7 +200,7 @@ def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = g.apply(func) tm.assert_series_equal(result, expected) @@ -247,7 +247,7 @@ def test_rolling_apply(self, raw, roll_frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) @@ -793,11 +793,11 @@ def test_groupby_rolling_object_doesnt_affect_groupby_apply(self, roll_frame): # GH 39732 g = roll_frame.groupby("A", group_keys=False) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = g.apply(lambda x: x.rolling(4).sum()).index _ = g.rolling(window=4) msg = 
"DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = g.apply(lambda x: x.rolling(4).sum()).index tm.assert_index_equal(result, expected) @@ -975,7 +975,7 @@ def test_groupby_monotonic(self): df = df.sort_values("date") msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = ( df.set_index("date") .groupby("name") @@ -1000,7 +1000,7 @@ def test_datelike_on_monotonic_within_each_group(self): ) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = ( df.set_index("B") .groupby("A") @@ -1036,7 +1036,7 @@ def test_expanding(self, f, frame): result = getattr(r, f)() msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = g.apply(lambda x: getattr(x.expanding(), f)()) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) @@ -1052,7 +1052,7 @@ def test_expanding_ddof(self, f, frame): result = getattr(r, f)(ddof=0) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) @@ -1070,7 +1070,7 @@ def test_expanding_quantile(self, interpolation, frame): result = r.quantile(0.4, interpolation=interpolation) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = g.apply( lambda x: x.expanding().quantile(0.4, interpolation=interpolation) ) @@ -1092,7 +1092,7 @@ def func_0(x): return getattr(x.expanding(), f)(frame) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = g.apply(func_0) # GH 39591: groupby.apply returns 1 instead of nan for windows # with all nan values @@ -1109,7 +1109,7 @@ def func_1(x): return getattr(x.B.expanding(), f)(pairwise=True) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = g.apply(func_1) tm.assert_series_equal(result, expected) @@ -1120,7 +1120,7 @@ def test_expanding_apply(self, raw, frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): expected = g.apply( lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw) ) From 40793146d61d142c5da6302db43e233fe4649108 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 29 Oct 2024 15:55:35 -0700 Subject: [PATCH 288/396] Backport PR #60089 on branch 
2.3.x (BLD: relax meson/meson-python requirements) (#60125) Backport PR #60089: BLD: relax meson/meson-python requirements Co-authored-by: Isuru Fernando --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6cf6caec79c27..9f2c7c0c56295 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,8 +2,8 @@ # Minimum requirements for the build system to execute. # See https://github.com/scipy/scipy/pull/12940 for the AIX issue. requires = [ - "meson-python==0.13.1", - "meson==1.2.1", + "meson-python>=0.13.1", + "meson>=1.2.1,<2", "wheel", "Cython~=3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json # Force numpy higher than 2.0, so that built wheels are compatible From ba3e93353f452ee0cbedd8c9687a3d19fad15fe2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 30 Oct 2024 11:30:08 +0100 Subject: [PATCH 289/396] [backport 2.3.x] TST (string dtype): duplicate pandas/tests/indexes/object tests specifically for string dtypes (#60117) (#60131) TST (string dtype): duplicate pandas/tests/indexes/object tests specifically for string dtypes (#60117) (cherry picked from commit d8905e4bee2aa0e096ed7831fea7d395d7657120) --- pandas/tests/indexes/object/test_astype.py | 18 --- pandas/tests/indexes/object/test_indexing.py | 83 ++----------- pandas/tests/indexes/string/__init__.py | 0 pandas/tests/indexes/string/test_astype.py | 21 ++++ pandas/tests/indexes/string/test_indexing.py | 118 +++++++++++++++++++ 5 files changed, 149 insertions(+), 91 deletions(-) create mode 100644 pandas/tests/indexes/string/__init__.py create mode 100644 pandas/tests/indexes/string/test_astype.py create mode 100644 pandas/tests/indexes/string/test_indexing.py diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py index ce05b5e9f2238..7e0de138aacfb 100644 --- a/pandas/tests/indexes/object/test_astype.py +++ b/pandas/tests/indexes/object/test_astype.py @@ -3,25 +3,7 @@ from pandas import ( Index, NaT, - Series, ) -import pandas._testing as tm - - -def test_astype_str_from_bytes(): - # https://github.com/pandas-dev/pandas/issues/38607 - # GH#49658 pre-2.0 Index called .values.astype(str) here, which effectively - # did a .decode() on the bytes object. 
In 2.0 we go through - # ensure_string_array which does f"{val}" - idx = Index(["あ", b"a"], dtype="object") - result = idx.astype(str) - expected = Index(["あ", "a"], dtype="str") - tm.assert_index_equal(result, expected) - - # while we're here, check that Series.astype behaves the same - result = Series(idx).astype(str) - expected = Series(expected, dtype="str") - tm.assert_series_equal(result, expected) def test_astype_invalid_nas_to_tdt64_raises(): diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 57e5c5e3b6abb..d3df349027c00 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -3,12 +3,8 @@ import numpy as np import pytest -from pandas._libs.missing import ( - NA, - is_matching_na, -) +from pandas._libs.missing import is_matching_na -import pandas as pd from pandas import Index import pandas._testing as tm @@ -22,13 +18,14 @@ class TestGetIndexer: ], ) def test_get_indexer_strings(self, method, expected): - index = Index(["b", "c"]) + expected = np.array(expected, dtype=np.intp) + index = Index(["b", "c"], dtype=object) actual = index.get_indexer(["a", "b", "c", "d"], method=method) tm.assert_numpy_array_equal(actual, expected) - def test_get_indexer_strings_raises(self, using_infer_string): - index = Index(["b", "c"]) + def test_get_indexer_strings_raises(self): + index = Index(["b", "c"], dtype=object) msg = "|".join( [ @@ -67,13 +64,9 @@ def test_get_indexer_with_NA_values( class TestGetIndexerNonUnique: - def test_get_indexer_non_unique_nas( - self, nulls_fixture, request, using_infer_string - ): + def test_get_indexer_non_unique_nas(self, nulls_fixture): # even though this isn't non-unique, this should still work - if using_infer_string and (nulls_fixture is None or nulls_fixture is NA): - request.applymarker(pytest.mark.xfail(reason="NAs are cast to NaN")) - index = Index(["a", "b", nulls_fixture]) + index = Index(["a", "b", nulls_fixture], dtype=object) indexer, missing = index.get_indexer_non_unique([nulls_fixture]) expected_indexer = np.array([2], dtype=np.intp) @@ -82,7 +75,7 @@ def test_get_indexer_non_unique_nas( tm.assert_numpy_array_equal(missing, expected_missing) # actually non-unique - index = Index(["a", nulls_fixture, "b", nulls_fixture]) + index = Index(["a", nulls_fixture, "b", nulls_fixture], dtype=object) indexer, missing = index.get_indexer_non_unique([nulls_fixture]) expected_indexer = np.array([1, 3], dtype=np.intp) @@ -91,10 +84,10 @@ def test_get_indexer_non_unique_nas( # matching-but-not-identical nans if is_matching_na(nulls_fixture, float("NaN")): - index = Index(["a", float("NaN"), "b", float("NaN")]) + index = Index(["a", float("NaN"), "b", float("NaN")], dtype=object) match_but_not_identical = True elif is_matching_na(nulls_fixture, Decimal("NaN")): - index = Index(["a", Decimal("NaN"), "b", Decimal("NaN")]) + index = Index(["a", Decimal("NaN"), "b", Decimal("NaN")], dtype=object) match_but_not_identical = True else: match_but_not_identical = False @@ -155,59 +148,3 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): expected_indexer = np.array([1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) - - -class TestSliceLocs: - @pytest.mark.parametrize( - "in_slice,expected", - [ - # error: Slice index must be an integer or None - (pd.IndexSlice[::-1], "yxdcb"), - (pd.IndexSlice["b":"y":-1], ""), # type: ignore[misc] - 
(pd.IndexSlice["b"::-1], "b"), # type: ignore[misc] - (pd.IndexSlice[:"b":-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice[:"y":-1], "y"), # type: ignore[misc] - (pd.IndexSlice["y"::-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice["y"::-4], "yb"), # type: ignore[misc] - # absent labels - (pd.IndexSlice[:"a":-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice[:"a":-2], "ydb"), # type: ignore[misc] - (pd.IndexSlice["z"::-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice["z"::-3], "yc"), # type: ignore[misc] - (pd.IndexSlice["m"::-1], "dcb"), # type: ignore[misc] - (pd.IndexSlice[:"m":-1], "yx"), # type: ignore[misc] - (pd.IndexSlice["a":"a":-1], ""), # type: ignore[misc] - (pd.IndexSlice["z":"z":-1], ""), # type: ignore[misc] - (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] - ], - ) - def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype): - index = Index(list("bcdxy"), dtype=any_string_dtype) - - s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) - result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected), dtype=any_string_dtype) - tm.assert_index_equal(result, expected) - - def test_slice_locs_negative_step_oob(self, any_string_dtype): - index = Index(list("bcdxy"), dtype=any_string_dtype) - - result = index[-10:5:1] - tm.assert_index_equal(result, index) - - result = index[4:-10:-1] - expected = Index(list("yxdcb"), dtype=any_string_dtype) - tm.assert_index_equal(result, expected) - - def test_slice_locs_dup(self): - index = Index(["a", "a", "b", "c", "d", "d"]) - assert index.slice_locs("a", "d") == (0, 6) - assert index.slice_locs(end="d") == (0, 6) - assert index.slice_locs("a", "c") == (0, 4) - assert index.slice_locs("b", "d") == (2, 6) - - index2 = index[::-1] - assert index2.slice_locs("d", "a") == (0, 6) - assert index2.slice_locs(end="a") == (0, 6) - assert index2.slice_locs("d", "b") == (0, 4) - assert index2.slice_locs("c", "a") == (2, 6) diff --git a/pandas/tests/indexes/string/__init__.py b/pandas/tests/indexes/string/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/indexes/string/test_astype.py b/pandas/tests/indexes/string/test_astype.py new file mode 100644 index 0000000000000..0349d85f23167 --- /dev/null +++ b/pandas/tests/indexes/string/test_astype.py @@ -0,0 +1,21 @@ +from pandas import ( + Index, + Series, +) +import pandas._testing as tm + + +def test_astype_str_from_bytes(): + # https://github.com/pandas-dev/pandas/issues/38607 + # GH#49658 pre-2.0 Index called .values.astype(str) here, which effectively + # did a .decode() on the bytes object. 
In 2.0 we go through + # ensure_string_array which does f"{val}" + idx = Index(["あ", b"a"], dtype="object") + result = idx.astype(str) + expected = Index(["あ", "a"], dtype="str") + tm.assert_index_equal(result, expected) + + # while we're here, check that Series.astype behaves the same + result = Series(idx).astype(str) + expected = Series(expected, dtype="str") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py new file mode 100644 index 0000000000000..755b7109a5a04 --- /dev/null +++ b/pandas/tests/indexes/string/test_indexing.py @@ -0,0 +1,118 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Index +import pandas._testing as tm + + +class TestGetIndexer: + @pytest.mark.parametrize( + "method,expected", + [ + ("pad", [-1, 0, 1, 1]), + ("backfill", [0, 0, 1, -1]), + ], + ) + def test_get_indexer_strings(self, any_string_dtype, method, expected): + expected = np.array(expected, dtype=np.intp) + index = Index(["b", "c"], dtype=any_string_dtype) + actual = index.get_indexer(["a", "b", "c", "d"], method=method) + + tm.assert_numpy_array_equal(actual, expected) + + def test_get_indexer_strings_raises(self, any_string_dtype): + index = Index(["b", "c"], dtype=any_string_dtype) + + msg = "|".join( + [ + "operation 'sub' not supported for dtype 'str", + r"unsupported operand type\(s\) for -: 'str' and 'str'", + ] + ) + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="nearest") + + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) + + with pytest.raises(TypeError, match=msg): + index.get_indexer( + ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] + ) + + +class TestGetIndexerNonUnique: + @pytest.mark.xfail(reason="TODO(infer_string)", strict=False) + def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", None], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + + expected_indexer = np.array([2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + # actually non-unique + index = Index(["a", None, "b", None], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + + expected_indexer = np.array([1, 3], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + +class TestSliceLocs: + @pytest.mark.parametrize( + "in_slice,expected", + [ + # error: Slice index must be an integer or None + (pd.IndexSlice[::-1], "yxdcb"), + (pd.IndexSlice["b":"y":-1], ""), # type: ignore[misc] + (pd.IndexSlice["b"::-1], "b"), # type: ignore[misc] + (pd.IndexSlice[:"b":-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice[:"y":-1], "y"), # type: ignore[misc] + (pd.IndexSlice["y"::-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice["y"::-4], "yb"), # type: ignore[misc] + # absent labels + (pd.IndexSlice[:"a":-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice[:"a":-2], "ydb"), # type: ignore[misc] + (pd.IndexSlice["z"::-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice["z"::-3], "yc"), # type: ignore[misc] + (pd.IndexSlice["m"::-1], "dcb"), # type: ignore[misc] + (pd.IndexSlice[:"m":-1], "yx"), # type: ignore[misc] + (pd.IndexSlice["a":"a":-1], ""), # type: 
ignore[misc] + (pd.IndexSlice["z":"z":-1], ""), # type: ignore[misc] + (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] + ], + ) + def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) + + s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) + result = index[s_start : s_stop : in_slice.step] + expected = Index(list(expected), dtype=any_string_dtype) + tm.assert_index_equal(result, expected) + + def test_slice_locs_negative_step_oob(self, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) + + result = index[-10:5:1] + tm.assert_index_equal(result, index) + + result = index[4:-10:-1] + expected = Index(list("yxdcb"), dtype=any_string_dtype) + tm.assert_index_equal(result, expected) + + def test_slice_locs_dup(self, any_string_dtype): + index = Index(["a", "a", "b", "c", "d", "d"], dtype=any_string_dtype) + assert index.slice_locs("a", "d") == (0, 6) + assert index.slice_locs(end="d") == (0, 6) + assert index.slice_locs("a", "c") == (0, 4) + assert index.slice_locs("b", "d") == (2, 6) + + index2 = index[::-1] + assert index2.slice_locs("d", "a") == (0, 6) + assert index2.slice_locs(end="a") == (0, 6) + assert index2.slice_locs("d", "b") == (0, 4) + assert index2.slice_locs("c", "a") == (2, 6) From d4b3aa5127f2ab7aac94f7f2db20db099ef8676e Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 30 Oct 2024 13:54:19 -0700 Subject: [PATCH 290/396] Backport PR #60035 on branch 2.3.x (Bump pypa/cibuildwheel from 2.21.0 to 2.21.3) (#60147) Backport PR #60035: Bump pypa/cibuildwheel from 2.21.0 to 2.21.3 Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 8d7706042718b..c06146b8e67f6 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -150,7 +150,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.21.0 + uses: pypa/cibuildwheel@v2.21.3 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From 409837ad33d1affae4200f603a30ed5ec4249327 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 31 Oct 2024 10:40:40 +0100 Subject: [PATCH 291/396] [backport 2.3.x] CI: remove usage of legacy NPY_PROMOTION_STATE (#60144) (#60153) CI: remove usage of legacy NPY_PROMOTION_STATE (#60144) (cherry picked from commit 4bbb3ce5d5a7e0f24dc2d8c1faf26c3b5d55670d) --- .github/workflows/unit-tests.yml | 1 - ci/deps/actions-311-pyarrownightly.yaml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index bd7da3a804634..4a2c412d2d98e 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -117,7 +117,6 @@ jobs: TEST_ARGS: ${{ matrix.test_args || '' }} PYTEST_WORKERS: ${{ matrix.pytest_workers || 'auto' }} PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} - NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }} # Clipboard tests QT_QPA_PLATFORM: offscreen REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }} diff --git 
a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index b90fa2e044cd6..978611b3fa96f 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -18,7 +18,7 @@ dependencies: # required dependencies - python-dateutil - - numpy<2 + - numpy # pytz 2024.2 timezones cause wrong results - pytz<2024.2 - pip From fa7c87b5b58f40a1ee55b872845ba0e0458f55ad Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 31 Oct 2024 10:41:32 +0100 Subject: [PATCH 292/396] [backport 2.3.x] CI/TST: fix parquet tz test returning pytz fixed offset (pyarrow 18) (#60143) (#60151) CI/TST: fix parquet tz test returning pytz fixed offset (pyarrow 18) (#60143) * CI/TST: fix parquet tz test returning pytz fixed offset (pyarrow 18) * only convert to pytz if installed (cherry picked from commit 9cd4a281c42838cd32261b92a55aed830ebeae03) --- pandas/tests/io/test_parquet.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 746ca3cf6534d..e43aae6a2e9e7 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -993,16 +993,9 @@ def test_timestamp_nanoseconds(self, pa): df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1ns", periods=10)}) check_round_trip(df, pa, write_kwargs={"version": ver}) - def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): + def test_timezone_aware_index(self, pa, timezone_aware_date_list): pytest.importorskip("pyarrow", "11.0.0") - if timezone_aware_date_list.tzinfo != datetime.timezone.utc: - request.applymarker( - pytest.mark.xfail( - reason="temporary skip this test until it is properly resolved: " - "https://github.com/pandas-dev/pandas/issues/37286" - ) - ) idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) @@ -1015,7 +1008,23 @@ def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): # they both implement datetime.tzinfo # they both wrap datetime.timedelta() # this use-case sets the resolution to 1 minute - check_round_trip(df, pa, check_dtype=False) + + expected = df[:] + if pa_version_under11p0: + expected.index = expected.index.as_unit("ns") + if timezone_aware_date_list.tzinfo != datetime.timezone.utc: + # pyarrow returns pytz.FixedOffset while pandas constructs datetime.timezone + # https://github.com/pandas-dev/pandas/issues/37286 + try: + import pytz + except ImportError: + pass + else: + offset = df.index.tz.utcoffset(timezone_aware_date_list) + tz = pytz.FixedOffset(offset.total_seconds() / 60) + expected.index = expected.index.tz_convert(tz) + expected["index_as_col"] = expected["index_as_col"].dt.tz_convert(tz) + check_round_trip(df, pa, check_dtype=False, expected=expected) def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 From e620e9dce4a40b46e768cca74220735852516223 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 31 Oct 2024 12:16:13 +0100 Subject: [PATCH 293/396] [backport 2.3.x] BUG/TST (string dtype): fix and update tests for Stata IO (#60130) (#60155) BUG/TST (string dtype): fix and update tests for Stata IO (#60130) (cherry picked from commit e7d54a54da8a179fbde5878dfb4e6440d0cfbac8) --- pandas/io/stata.py | 5 +++ pandas/tests/io/test_stata.py | 82 ++++++++++++++++++----------------- 2 files changed, 48 insertions(+), 39 deletions(-) diff 
--git a/pandas/io/stata.py b/pandas/io/stata.py index 4abf9af185a01..b5057a6681638 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -605,7 +605,11 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: if getattr(data[col].dtype, "numpy_dtype", None) is not None: data[col] = data[col].astype(data[col].dtype.numpy_dtype) elif is_string_dtype(data[col].dtype): + # TODO could avoid converting string dtype to object here, + # but handle string dtype in _encode_strings data[col] = data[col].astype("object") + # generate_table checks for None values + data.loc[data[col].isna(), col] = None dtype = data[col].dtype empty_df = data.shape[0] == 0 @@ -2671,6 +2675,7 @@ def _encode_strings(self) -> None: continue column = self.data[col] dtype = column.dtype + # TODO could also handle string dtype here specifically if dtype.type is np.object_: inferred_dtype = infer_dtype(column, skipna=True) if not ((inferred_dtype == "string") or len(column) == 0): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 09509fb495034..32f1c8d65271b 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -11,8 +11,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -347,9 +345,8 @@ def test_write_dta6(self, datapath): check_index_type=False, ) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_read_write_dta10(self, version): + def test_read_write_dta10(self, version, using_infer_string): original = DataFrame( data=[["string", "object", 1, 1.1, np.datetime64("2003-12-25")]], columns=["string", "object", "integer", "floating", "datetime"], @@ -362,12 +359,17 @@ def test_read_write_dta10(self, version): with tm.ensure_clean() as path: original.to_stata(path, convert_dates={"datetime": "tc"}, version=version) written_and_read_again = self.read_dta(path) - # original.index is np.int32, read index is np.int64 - tm.assert_frame_equal( - written_and_read_again.set_index("index"), - original, - check_index_type=False, - ) + + expected = original.copy() + if using_infer_string: + expected["object"] = expected["object"].astype("str") + + # original.index is np.int32, read index is np.int64 + tm.assert_frame_equal( + written_and_read_again.set_index("index"), + expected, + check_index_type=False, + ) def test_stata_doc_examples(self): with tm.ensure_clean() as path: @@ -1153,7 +1155,6 @@ def test_categorical_ordering(self, file, datapath): assert parsed[col].cat.ordered assert not parsed_unordered[col].cat.ordered - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore::UserWarning") @pytest.mark.parametrize( "file", @@ -1215,6 +1216,10 @@ def _convert_categorical(from_frame: DataFrame) -> DataFrame: if cat.categories.dtype == object: categories = pd.Index._with_infer(cat.categories._values) cat = cat.set_categories(categories) + elif cat.categories.dtype == "string" and len(cat.categories) == 0: + # if the read categories are empty, it comes back as object dtype + categories = cat.categories.astype(object) + cat = cat.set_categories(categories) from_frame[col] = cat return from_frame @@ -1244,7 +1249,6 @@ def test_iterator(self, datapath): from_chunks = pd.concat(itr) tm.assert_frame_equal(parsed, from_chunks) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) 
@pytest.mark.filterwarnings("ignore::UserWarning") @pytest.mark.parametrize( "file", @@ -1548,12 +1552,11 @@ def test_inf(self, infval): with tm.ensure_clean() as path: df.to_stata(path) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_path_pathlib(self): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" reader = lambda x: read_stata(x).set_index("index") @@ -1584,13 +1587,12 @@ def test_value_labels_iterator(self, write_index): value_labels = dta_iter.value_labels() assert value_labels == {"A": {0: "A", 1: "B", 2: "C", 3: "E"}} - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_set_index(self): # GH 17328 df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" with tm.ensure_clean() as path: @@ -1618,8 +1620,7 @@ def test_date_parsing_ignores_format_details(self, column, datapath): formatted = df.loc[0, column + "_fmt"] assert unformatted == formatted - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_writer_117(self): + def test_writer_117(self, using_infer_string): original = DataFrame( data=[ [ @@ -1682,13 +1683,17 @@ def test_writer_117(self): version=117, ) written_and_read_again = self.read_dta(path) - # original.index is np.int32, read index is np.int64 - tm.assert_frame_equal( - written_and_read_again.set_index("index"), - original, - check_index_type=False, - ) - tm.assert_frame_equal(original, copy) + + expected = original[:] + if using_infer_string: + # object dtype (with only strings/None) comes back as string dtype + expected["object"] = expected["object"].astype("str") + + tm.assert_frame_equal( + written_and_read_again.set_index("index"), + expected, + ) + tm.assert_frame_equal(original, copy) def test_convert_strl_name_swap(self): original = DataFrame( @@ -1725,15 +1730,14 @@ def test_invalid_date_conversion(self): with pytest.raises(ValueError, match=msg): original.to_stata(path, convert_dates={"wrong_name": "tc"}) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_nonfile_writing(self, version): # GH 21041 bio = io.BytesIO() df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" with tm.ensure_clean() as path: @@ -1744,13 +1748,12 @@ def test_nonfile_writing(self, version): reread = read_stata(path, index_col="index") tm.assert_frame_equal(df, reread) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_gzip_writing(self): # writing version 117 requires seek and cannot be used with gzip df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" with tm.ensure_clean() as path: @@ -1777,8 +1780,7 
@@ def test_unicode_dta_118(self, datapath): tm.assert_frame_equal(unicode_df, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_mixed_string_strl(self): + def test_mixed_string_strl(self, using_infer_string): # GH 23633 output = [{"mixed": "string" * 500, "number": 0}, {"mixed": None, "number": 1}] output = DataFrame(output) @@ -1796,7 +1798,10 @@ def test_mixed_string_strl(self): path, write_index=False, convert_strl=["mixed"], version=117 ) reread = read_stata(path) - expected = output.fillna("") + expected = output.copy() + if using_infer_string: + expected["mixed"] = expected["mixed"].astype("str") + expected = expected.fillna("") tm.assert_frame_equal(reread, expected) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @@ -1875,7 +1880,7 @@ def test_stata_119(self, datapath): reader._ensure_open() assert reader._nvar == 32999 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.filterwarnings("ignore:Downcasting behavior:FutureWarning") @pytest.mark.parametrize("version", [118, 119, None]) def test_utf8_writer(self, version): cat = pd.Categorical(["a", "β", "ĉ"], ordered=True) @@ -2143,14 +2148,13 @@ def test_iterator_errors(datapath, chunksize): pass -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_iterator_value_labels(): # GH 31544 values = ["c_label", "b_label"] + ["a_label"] * 500 df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)}) with tm.ensure_clean() as path: df.to_stata(path, write_index=False) - expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object") + expected = pd.Index(["a_label", "b_label", "c_label"]) with read_stata(path, chunksize=100) as reader: for j, chunk in enumerate(reader): for i in range(2): From 4f189a4b90f7378a5361308c4a9ef91aa0c16299 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 31 Oct 2024 13:54:44 +0100 Subject: [PATCH 294/396] [backport 2.3.x] String dtype: implement sum reduction (#59853) (#60157) String dtype: implement sum reduction (#59853) (cherry picked from commit 2fdb16b347fc34f78213868a8a973447ac79ab2d) --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/array_algos/masked_reductions.py | 4 ++ pandas/core/arrays/arrow/array.py | 32 ++++++++++ pandas/core/arrays/string_.py | 18 +++++- pandas/core/arrays/string_arrow.py | 6 +- pandas/tests/apply/test_frame_apply.py | 10 --- pandas/tests/apply/test_invalid_arg.py | 39 ++++++------ pandas/tests/arrays/string_/test_string.py | 2 - pandas/tests/extension/test_arrow.py | 26 ++------ pandas/tests/extension/test_string.py | 2 +- pandas/tests/frame/test_reductions.py | 63 ++++++------------- pandas/tests/groupby/test_groupby.py | 15 +---- pandas/tests/groupby/test_raises.py | 3 +- .../tests/groupby/transform/test_transform.py | 11 +--- pandas/tests/series/test_reductions.py | 38 ++++------- 15 files changed, 121 insertions(+), 150 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 00503766b062f..cc561a888f843 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -32,7 +32,7 @@ enhancement1 Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 335fa1afc0f4e..6bf97729a79b1 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -62,6 +62,10 @@ def _reductions( ): return libmissing.NA + if values.dtype == np.dtype(object): + # object dtype does not support `where` without passing an initial + values = values[~mask] + return func(values, axis=axis, **kwargs) return func(values, where=~mask, axis=axis, **kwargs) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index f3d7a3cc6d694..51136961c0fb3 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -69,6 +69,7 @@ unpack_tuple_and_ellipses, validate_indices, ) +from pandas.core.nanops import check_below_min_count from pandas.core.strings.base import BaseStringArrayMethods from pandas.io._util import _arrow_dtype_mapping @@ -1694,6 +1695,37 @@ def pyarrow_meth(data, skip_nulls, **kwargs): denominator = pc.sqrt_checked(pc.count(self._pa_array)) return pc.divide_checked(numerator, denominator) + elif name == "sum" and ( + pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type) + ): + + def pyarrow_meth(data, skip_nulls, min_count=0): # type: ignore[misc] + mask = pc.is_null(data) if data.null_count > 0 else None + if skip_nulls: + if min_count > 0 and check_below_min_count( + (len(data),), + None if mask is None else mask.to_numpy(), + min_count, + ): + return pa.scalar(None, type=data.type) + if data.null_count > 0: + # binary_join returns null if there is any null -> + # have to filter out any nulls + data = data.filter(pc.invert(mask)) + else: + if mask is not None or check_below_min_count( + (len(data),), None, min_count + ): + return pa.scalar(None, type=data.type) + + if pa.types.is_large_string(data.type): + # binary_join only supports string, not large_string + data = data.cast(pa.string()) + data_list = pa.ListArray.from_arrays( + [0, len(data)], data.combine_chunks() + )[0] + return pc.binary_join(data_list, "") + else: pyarrow_name = { "median": "quantile", diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 5b69344bac0c8..faad516b53a4a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -812,8 +812,8 @@ def _reduce( else: return nanops.nanall(self._ndarray, skipna=skipna) - if name in ["min", "max"]: - result = getattr(self, name)(skipna=skipna, axis=axis) + if name in ["min", "max", "sum"]: + result = getattr(self, name)(skipna=skipna, axis=axis, **kwargs) if keepdims: return self._from_sequence([result], dtype=self.dtype) return result @@ -839,6 +839,20 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: ) return self._wrap_reduction_result(axis, result) + def sum( + self, + *, + axis: AxisInt | None = None, + skipna: bool = True, + min_count: int = 0, + **kwargs, + ) -> Scalar: + nv.validate_sum((), kwargs) + result = masked_reductions.sum( + values=self._ndarray, mask=self.isna(), skipna=skipna + ) + return self._wrap_reduction_result(axis, result) + def value_counts(self, dropna: bool = True) -> Series: from pandas.core.algorithms import value_counts_internal as value_counts diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9389f7cffca9f..e7dd4f9dc5718 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py 
@@ -430,7 +430,11 @@ def _reduce( return result.astype(np.bool_) return result - result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + if name in ("min", "max", "sum", "argmin", "argmax"): + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + else: + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + if name in ("argmin", "argmax") and isinstance(result, pa.Array): return self._convert_int_result(result) elif isinstance(result, pa.Array): diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 6a328dfb39be5..b7eac6b8f0ea1 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -4,10 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -1173,7 +1169,6 @@ def test_agg_with_name_as_column_name(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_agg_multiple_mixed(): # GH 20909 mdf = DataFrame( @@ -1202,9 +1197,6 @@ def test_agg_multiple_mixed(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) def test_agg_multiple_mixed_raises(): # GH 20909 mdf = DataFrame( @@ -1294,7 +1286,6 @@ def test_agg_reduce(axis, float_frame): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_nuiscance_columns(): # GH 15015 df = DataFrame( @@ -1471,7 +1462,6 @@ def test_apply_datetime_tz_issue(engine, request): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})]) @pytest.mark.parametrize("method", ["min", "max", "sum"]) def test_mixed_column_raises(df, method, using_infer_string): diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 1c5b170c8753f..8963265b0a800 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -12,9 +12,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW from pandas.errors import SpecificationError from pandas import ( @@ -212,10 +209,6 @@ def transform(row): data.apply(transform, axis=1) -# we should raise a proper TypeError instead of propagating the pyarrow error -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) @pytest.mark.parametrize( "df, func, expected", tm.get_cython_table_params( @@ -225,21 +218,25 @@ def transform(row): def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string): # GH 21224 if using_infer_string: - import pyarrow as pa + if df.dtypes.iloc[0].storage == "pyarrow": + import pyarrow as pa - expected = (expected, pa.lib.ArrowNotImplementedError) + # TODO(infer_string) + # should raise a proper TypeError instead of propagating the pyarrow error - msg = "can't multiply sequence by non-int of type 'str'|has no kernel" + expected = (expected, pa.lib.ArrowNotImplementedError) + else: + expected = (expected, NotImplementedError) + + msg = ( + "can't multiply sequence by non-int of type 'str'|has no kernel|cannot perform" + ) warn = None if isinstance(func, 
str) else FutureWarning with pytest.raises(expected, match=msg): with tm.assert_produces_warning(warn, match="using DataFrame.cumprod"): df.agg(func, axis=axis) -# we should raise a proper TypeError instead of propagating the pyarrow error -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) @pytest.mark.parametrize( "series, func, expected", chain( @@ -263,11 +260,15 @@ def test_agg_cython_table_raises_series(series, func, expected, using_infer_stri msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" if using_infer_string: - import pyarrow as pa - - expected = (expected, pa.lib.ArrowNotImplementedError) - - msg = msg + "|does not support|has no kernel" + if series.dtype.storage == "pyarrow": + import pyarrow as pa + + # TODO(infer_string) + # should raise a proper TypeError instead of propagating the pyarrow error + expected = (expected, pa.lib.ArrowNotImplementedError) + else: + expected = (expected, NotImplementedError) + msg = msg + "|does not support|has no kernel|Cannot perform|cannot perform" warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 265b9fc40629b..73e8bde827d50 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -444,7 +444,6 @@ def test_astype_float(dtype, any_float_dtype): @pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce(skipna, dtype): arr = pd.Series(["a", "b", "c"], dtype=dtype) result = arr.sum(skipna=skipna) @@ -452,7 +451,6 @@ def test_reduce(skipna, dtype): @pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce_missing(skipna, dtype): arr = pd.Series([None, "a", None, "b", "c", None], dtype=dtype) result = arr.sum(skipna=skipna) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 60e7bd83432c5..0ce7a66e0e00c 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -459,10 +459,11 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: pass else: return False + elif pa.types.is_binary(pa_dtype) and op_name == "sum": + return False elif ( pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) ) and op_name in [ - "sum", "mean", "median", "prod", @@ -553,6 +554,7 @@ def test_reduce_series_boolean( return super().test_reduce_series_boolean(data, all_boolean_reductions, skipna) def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): + pa_type = arr._pa_array.type if op_name in ["max", "min"]: cmp_dtype = arr.dtype elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": @@ -562,6 +564,8 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): cmp_dtype = "float64[pyarrow]" elif op_name in ["median", "var", "std", "mean", "skew"]: cmp_dtype = "float64[pyarrow]" + elif op_name == "sum" and pa.types.is_string(pa_type): + cmp_dtype = arr.dtype else: cmp_dtype = { "i": "int64[pyarrow]", @@ -585,26 +589,6 @@ def test_median_not_approximate(self, typ): result = pd.Series([1, 2], dtype=f"{typ}[pyarrow]").median() assert result == 1.5 - def test_in_numeric_groupby(self, data_for_grouping): - dtype = data_for_grouping.dtype - if is_string_dtype(dtype): - df = pd.DataFrame( - { - "A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping, - "C": [1, 1, 
1, 1, 1, 1, 1, 1], - } - ) - - expected = pd.Index(["C"]) - msg = re.escape(f"agg function failed [how->sum,dtype->{dtype}") - with pytest.raises(TypeError, match=msg): - df.groupby("A").sum() - result = df.groupby("A").sum(numeric_only=True).columns - tm.assert_index_equal(result, expected) - else: - super().test_in_numeric_groupby(data_for_grouping) - def test_construct_from_string_own_name(self, dtype, request): pa_dtype = dtype.pyarrow_dtype if pa.types.is_decimal(pa_dtype): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 07c3b4224e76f..6af095e33396c 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -191,7 +191,7 @@ def _get_expected_exception( def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return ( - op_name in ["min", "max"] + op_name in ["min", "max", "sum"] or ser.dtype.na_value is np.nan # type: ignore[union-attr] and op_name in ("any", "all") ) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index a4263279a7bd5..824d53c8d5d13 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -226,7 +226,6 @@ def float_frame_with_na(): class TestDataFrameAnalytics: # --------------------------------------------------------------------- # Reductions - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "opname", @@ -246,17 +245,11 @@ class TestDataFrameAnalytics: pytest.param("kurt", marks=td.skip_if_no("scipy")), ], ) - def test_stat_op_api_float_string_frame( - self, float_string_frame, axis, opname, using_infer_string - ): - if ( - (opname in ("sum", "min", "max") and axis == 0) - or opname - in ( - "count", - "nunique", - ) - ) and not (using_infer_string and opname == "sum"): + def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): + if (opname in ("sum", "min", "max") and axis == 0) or opname in ( + "count", + "nunique", + ): getattr(float_string_frame, opname)(axis=axis) else: if opname in ["var", "std", "sem", "skew", "kurt"]: @@ -283,10 +276,11 @@ def test_stat_op_api_float_string_frame( msg = "'[><]=' not supported between instances of 'float' and 'str'" elif opname == "median": msg = re.compile( - r"Cannot convert \[.*\] to numeric|does not support", flags=re.S + r"Cannot convert \[.*\] to numeric|does not support|Cannot perform", + flags=re.S, ) if not isinstance(msg, re.Pattern): - msg = msg + "|does not support" + msg = msg + "|does not support|Cannot perform reduction" with pytest.raises(TypeError, match=msg): getattr(float_string_frame, opname)(axis=axis) if opname != "nunique": @@ -432,7 +426,6 @@ def test_stat_operators_attempt_obj_array(self, method, df, axis): expected[expected.isna()] = None tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) def test_mixed_ops(self, op): # GH#16116 @@ -449,26 +442,16 @@ def test_mixed_ops(self, op): "could not convert", "can't multiply sequence by non-int", "does not support", + "Cannot perform", ] ) with pytest.raises(TypeError, match=msg): getattr(df, op)() with pd.option_context("use_bottleneck", False): - msg = "|".join( - [ - "Could not convert", - "could not convert", - "can't multiply sequence by non-int", - "does not support", - ] - ) with 
pytest.raises(TypeError, match=msg): getattr(df, op)() - @pytest.mark.xfail( - using_string_dtype(), reason="sum doesn't work for arrow strings" - ) def test_reduce_mixed_frame(self): # GH 6806 df = DataFrame( @@ -610,7 +593,6 @@ def test_sem(self, datetime_frame): result = nanops.nansem(arr, axis=0) assert not (result < 0).any() - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dropna, expected", [ @@ -632,7 +614,7 @@ def test_sem(self, datetime_frame): "A": [12], "B": [10.0], "C": [np.nan], - "D": np.array([np.nan], dtype=object), + "D": Series([np.nan], dtype="str"), "E": Categorical([np.nan], categories=["a"]), "F": DatetimeIndex([pd.NaT], dtype="M8[ns]"), "G": to_timedelta([pd.NaT]), @@ -674,7 +656,7 @@ def test_mode_dropna(self, dropna, expected): "A": [12, 12, 19, 11], "B": [10, 10, np.nan, 3], "C": [1, np.nan, np.nan, np.nan], - "D": Series([np.nan, np.nan, "a", np.nan], dtype=object), + "D": Series([np.nan, np.nan, "a", np.nan], dtype="str"), "E": Categorical([np.nan, np.nan, "a", np.nan]), "F": DatetimeIndex(["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"), "G": to_timedelta(["1 days", "nan", "nan", "nan"]), @@ -694,7 +676,6 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_mode_sortwarning(self, using_infer_string): # Check for the warning that is raised when the mode # results cannot be sorted @@ -702,8 +683,13 @@ def test_mode_sortwarning(self, using_infer_string): df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) expected = DataFrame({"A": ["a", np.nan]}) - warning = None if using_infer_string else UserWarning - with tm.assert_produces_warning(warning): + # TODO(infer_string) avoid this UserWarning for python storage + warning = ( + None + if using_infer_string and df.A.dtype.storage == "pyarrow" + else UserWarning + ) + with tm.assert_produces_warning(warning, match="Unable to sort modes"): result = df.mode(dropna=False) result = result.sort_values(by="A").reset_index(drop=True) @@ -1367,13 +1353,10 @@ def test_any_all_extra(self): result = df[["C"]].all(axis=None).item() assert result is True - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize("skipna", [True, False]) - def test_any_all_object_dtype( - self, axis, bool_agg_func, skipna, using_infer_string - ): + def test_any_all_object_dtype(self, axis, bool_agg_func, skipna): # GH#35450 df = DataFrame( data=[ @@ -1383,13 +1366,8 @@ def test_any_all_object_dtype( [np.nan, np.nan, "5", np.nan], ] ) - if using_infer_string: - # na in object is True while in string pyarrow numpy it's false - val = not axis == 0 and not skipna and bool_agg_func == "all" - else: - val = True result = getattr(df, bool_agg_func)(axis=axis, skipna=skipna) - expected = Series([True, True, val, True]) + expected = Series([True, True, True, True]) tm.assert_series_equal(result, expected) # GH#50947 deprecates this but it is not emitting a warning in some builds. 
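For context, the string sum() reduction that these test updates exercise behaves roughly as in the sketch below. This is a hypothetical session, not part of the patch itself; it assumes a pandas 2.3.x build with this series applied:

    import pandas as pd

    # skipna defaults to True, so missing values are dropped before joining
    ser = pd.Series(["a", "b", None], dtype="string")
    print(ser.sum())              # "ab" -- missing values are skipped by default
    print(ser.sum(skipna=False))  # <NA> -- the missing value propagates

The pyarrow-backed variant reduces via pc.binary_join as implemented earlier in this patch, while the python-backed array falls back to the masked object-dtype reduction in masked_reductions.sum.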
@@ -1969,7 +1947,6 @@ def test_sum_timedelta64_skipna_false(using_array_manager, request): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="sum doesn't work with arrow strings") def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 df = DataFrame([["a", 1]], columns=list("ab")) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 57e691b3c508d..13269ea9c0920 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -8,15 +8,12 @@ from pandas._config import using_string_dtype -from pandas.compat import HAS_PYARROW from pandas.errors import ( PerformanceWarning, SpecificationError, ) import pandas.util._test_decorators as td -from pandas.core.dtypes.common import is_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -1744,23 +1741,15 @@ def g(group): tm.assert_series_equal(result, expected) -# TODO harmonize error messages -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) -def test_set_group_name(df, grouper, using_infer_string): +def test_set_group_name(df, grouper): def f(group): assert group.name is not None return group def freduce(group): assert group.name is not None - if using_infer_string and grouper == "A" and is_string_dtype(group.dtype): - with pytest.raises(TypeError, match="does not support"): - group.sum() - else: - return group.sum() + return group.sum() def freducex(x): return freduce(x) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index d5b7a3f25d0eb..4ebb26b0289ec 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -219,7 +219,6 @@ def func(x): getattr(gb, how)(func) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("how", ["agg", "transform"]) @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) def test_groupby_raises_string_np( @@ -236,7 +235,7 @@ def test_groupby_raises_string_np( np.sum: (None, ""), np.mean: ( TypeError, - re.escape("agg function failed [how->mean,dtype->object]"), + "agg function failed|Cannot perform reduction 'mean' with string dtype", ), }[groupby_func_np] diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 690eb6f410798..2aada753e27f4 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -5,7 +5,6 @@ from pandas._config import using_string_dtype from pandas._libs import lib -from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import ensure_platform_int @@ -512,10 +511,7 @@ def test_transform_nuisance_raises(df, using_infer_string): gbc = grouped["B"] msg = "Could not convert" if using_infer_string: - if df.columns.dtype.storage == "pyarrow": - msg = "with dtype str does not support reduction 'mean'" - else: - msg = "Cannot perform reduction 'mean' with string dtype" + msg = "Cannot perform reduction 'mean' with string dtype" with pytest.raises(TypeError, match=msg): gbc.transform(lambda x: np.mean(x)) @@ -620,10 +616,7 @@ def test_groupby_transform_with_int(using_infer_string): ) msg = "Could not convert" if using_infer_string: - if HAS_PYARROW: - msg = "with dtype str does not support reduction 'mean'" - else: - msg = "Cannot perform 
reduction 'mean' with string dtype" + msg = "Cannot perform reduction 'mean' with string dtype" with np.errstate(all="ignore"): with pytest.raises(TypeError, match=msg): df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index fcae835e4c3e2..5415f220cadd4 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -1,10 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - import pandas as pd from pandas import Series import pandas._testing as tm @@ -167,65 +163,55 @@ def test_validate_stat_keepdims(): np.sum(ser, keepdims=True) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) def test_mean_with_convertible_string_raises(using_array_manager, using_infer_string): # GH#44008 ser = Series(["1", "2"]) - if using_infer_string: - msg = "does not support" - with pytest.raises(TypeError, match=msg): - ser.sum() - else: - assert ser.sum() == "12" - msg = "Could not convert string '12' to numeric|does not support" + assert ser.sum() == "12" + + msg = "Could not convert string '12' to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): ser.mean() df = ser.to_frame() if not using_array_manager: - msg = r"Could not convert \['12'\] to numeric|does not support" + msg = r"Could not convert \['12'\] to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): df.mean() -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) def test_mean_dont_convert_j_to_complex(using_array_manager): # GH#36703 df = pd.DataFrame([{"db": "J", "numeric": 123}]) if using_array_manager: msg = "Could not convert string 'J' to numeric" else: - msg = r"Could not convert \['J'\] to numeric|does not support" + msg = r"Could not convert \['J'\] to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): df.mean() with pytest.raises(TypeError, match=msg): df.agg("mean") - msg = "Could not convert string 'J' to numeric|does not support" + msg = "Could not convert string 'J' to numeric|does not support|Cannot perform" with pytest.raises(TypeError, match=msg): df["db"].mean() - msg = "Could not convert string 'J' to numeric|ufunc 'divide'" + msg = "Could not convert string 'J' to numeric|ufunc 'divide'|Cannot perform" with pytest.raises(TypeError, match=msg): np.mean(df["db"].astype("string").array) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) def test_median_with_convertible_string_raises(using_array_manager): # GH#34671 this _could_ return a string "2", but definitely not float 2.0 - msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support" + msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support|Cannot perform" ser = Series(["1", "2", "3"]) with pytest.raises(TypeError, match=msg): ser.median() if not using_array_manager: - msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support" + msg = ( + r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support" + "|Cannot perform" + ) df = ser.to_frame() with pytest.raises(TypeError, match=msg): df.median() From 018a38e402c7222adc2cecff46f1e98de9f7edd5 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 31 Oct 2024 08:55:42 -0700 Subject: [PATCH 295/396] 
Backport PR #60133 on branch 2.3.x (TST (string dtype): update tests/reductions tests) (#60158) Backport PR #60133: TST (string dtype): update tests/reductions tests Co-authored-by: Joris Van den Bossche --- pandas/tests/reductions/test_reductions.py | 43 +++++++++++++++------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index c58db25991510..ed2b01b09bb71 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1246,6 +1246,7 @@ def test_idxminmax_object_dtype(self, using_infer_string): with pytest.raises(TypeError, match=msg): ser3.idxmin(skipna=False) + # TODO(infer_string) implement argmin/max for python string dtype @pytest.mark.xfail( using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" ) @@ -1485,12 +1486,14 @@ def test_mode_numerical_nan(self, dropna, expected): expected = Series(expected) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( - "dropna, expected1, expected2, expected3", - [(True, ["b"], ["bar"], ["nan"]), (False, ["b"], [np.nan], ["nan"])], + "dropna, expected1, expected2", + [ + (True, ["b"], ["bar"]), + (False, ["b"], [np.nan]), + ], ) - def test_mode_str_obj(self, dropna, expected1, expected2, expected3): + def test_mode_object(self, dropna, expected1, expected2): # Test string and object types. data = ["a"] * 2 + ["b"] * 3 @@ -1503,17 +1506,32 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3): s = Series(data, dtype=object) result = s.mode(dropna) - expected2 = Series(expected2, dtype=None if expected2 == ["bar"] else object) + expected2 = Series(expected2, dtype=object) tm.assert_series_equal(result, expected2) + @pytest.mark.parametrize( + "dropna, expected1, expected2", + [ + (True, ["b"], ["bar"]), + (False, ["b"], [np.nan]), + ], + ) + def test_mode_string(self, dropna, expected1, expected2, any_string_dtype): + # Test string and object types. 
+ data = ["a"] * 2 + ["b"] * 3 + + s = Series(data, dtype=any_string_dtype) + result = s.mode(dropna) + expected1 = Series(expected1, dtype=any_string_dtype) + tm.assert_series_equal(result, expected1) + data = ["foo", "bar", "bar", np.nan, np.nan, np.nan] - s = Series(data, dtype=object).astype(str) + s = Series(data, dtype=any_string_dtype) result = s.mode(dropna) - expected3 = Series(expected3) - tm.assert_series_equal(result, expected3) + expected2 = Series(expected2, dtype=any_string_dtype) + tm.assert_series_equal(result, expected2) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dropna, expected1, expected2", [(True, ["foo"], ["foo"]), (False, ["foo"], [np.nan])], @@ -1521,12 +1539,12 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3): def test_mode_mixeddtype(self, dropna, expected1, expected2): s = Series([1, "foo", "foo"]) result = s.mode(dropna) - expected = Series(expected1) + expected = Series(expected1, dtype=object) tm.assert_series_equal(result, expected) s = Series([1, "foo", "foo", np.nan, np.nan, np.nan]) result = s.mode(dropna) - expected = Series(expected2, dtype=None if expected2 == ["foo"] else object) + expected = Series(expected2, dtype=object) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1651,12 +1669,11 @@ def test_mode_intoverflow(self, dropna, expected1, expected2): expected2 = Series(expected2, dtype=np.uint64) tm.assert_series_equal(result, expected2) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_mode_sortwarning(self): # Check for the warning that is raised when the mode # results cannot be sorted - expected = Series(["foo", np.nan]) + expected = Series(["foo", np.nan], dtype=object) s = Series([1, "foo", "foo", np.nan, np.nan]) with tm.assert_produces_warning(UserWarning): From aedb17af73240595ebac9d7df21303ccbd036434 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 31 Oct 2024 09:13:38 -0700 Subject: [PATCH 296/396] Backport PR #60150 on branch 2.3.x (CI: remove uninstall of nomkl) (#60160) Backport PR #60150: CI: remove uninstall of nomkl Co-authored-by: Joris Van den Bossche --- .github/actions/build_pandas/action.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml index 85b44ab24b36d..63f687324b0ae 100644 --- a/.github/actions/build_pandas/action.yml +++ b/.github/actions/build_pandas/action.yml @@ -28,13 +28,6 @@ runs: fi shell: bash -el {0} - - name: Uninstall nomkl - run: | - if conda list nomkl | grep nomkl 1>/dev/null; then - conda remove nomkl -y - fi - shell: bash -el {0} - - name: Build Pandas run: | export CFLAGS="$CFLAGS ${{ inputs.cflags_adds }}" From 70c1febd651a81c20f6ec358931d839d5548abd0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Nov 2024 08:30:07 +0100 Subject: [PATCH 297/396] [backport 2.3.x] String dtype: deprecate the pyarrow_numpy storage option (#60152) (#60173) String dtype: deprecate the pyarrow_numpy storage option (#60152) * String dtype: deprecate the pyarrow_numpy storage option * add pyarrow skip (cherry picked from commit 1908f2eb962e5c8b84483a6f0582b5e32b6f0ee8) --- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/core/arrays/string_.py | 15 +++++++++++++-- pandas/tests/arrays/string_/test_string.py | 8 ++++++++ pandas/tests/extension/test_string.py | 4 ++-- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git 
a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index cc561a888f843..cda4754bb4acd 100644
--- a/doc/source/whatsnew/v2.3.0.rst
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -54,6 +54,7 @@ notable_bug_fix1
 Deprecations
 ~~~~~~~~~~~~
 - Deprecated allowing non-``bool`` values for ``na`` in :meth:`.str.contains`, :meth:`.str.startswith`, and :meth:`.str.endswith` for dtypes that do not already disallow these (:issue:`59615`)
+- Deprecated the ``"pyarrow_numpy"`` storage option for :class:`StringDtype` (:issue:`60152`)
 - The deprecation of setting the argument ``include_groups`` to ``True`` in :meth:`DataFrameGroupBy.apply` has been promoted from a ``DeprecationWarning`` to ``FutureWarning``; only ``False`` will be allowed (:issue:`7155`)

 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index faad516b53a4a..4081e3e1a4779 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -7,6 +7,7 @@
     Literal,
     cast,
 )
+import warnings

 import numpy as np

@@ -150,7 +152,16 @@ def __init__(
             storage = "python"

         if storage == "pyarrow_numpy":
-            # TODO raise a deprecation warning
+            warnings.warn(
+                "The 'pyarrow_numpy' storage option name is deprecated and will be "
+                'removed in pandas 3.0. Use \'pd.StringDtype(storage="pyarrow", '
+                "na_value=np.nan)' to construct the same dtype.\nOr enable the "
+                "'pd.options.future.infer_string = True' option globally and use "
+                'the "str" alias as a shorthand notation to specify a dtype '
+                '(instead of "string[pyarrow_numpy]").',
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
             storage = "pyarrow"
             na_value = np.nan

@@ -250,7 +261,7 @@ def construct_from_string(cls, string) -> Self:
         elif string == "string[pyarrow]":
             return cls(storage="pyarrow")
         elif string == "string[pyarrow_numpy]":
-            # TODO deprecate
+            # this is deprecated in the dtype __init__, remove this in pandas 3.0
             return cls(storage="pyarrow_numpy")
         else:
             raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 73e8bde827d50..e511ba62d5d09 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -41,6 +41,14 @@ def cls(dtype):
     return dtype.construct_array_type()


+def test_dtype_constructor():
+    pytest.importorskip("pyarrow")
+
+    with tm.assert_produces_warning(FutureWarning):
+        dtype = pd.StringDtype("pyarrow_numpy")
+    assert dtype == pd.StringDtype("pyarrow", na_value=np.nan)
+
+
 def test_dtype_equality():
     pytest.importorskip("pyarrow")

diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 6af095e33396c..7997bca5c1c9b 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -104,8 +104,8 @@ def test_eq_with_str(self, dtype):
             # only the NA-variant supports parametrized string alias
             assert dtype == f"string[{dtype.storage}]"
         elif dtype.storage == "pyarrow":
-            # TODO(infer_string) deprecate this
-            assert dtype == "string[pyarrow_numpy]"
+            with tm.assert_produces_warning(FutureWarning):
+                assert dtype == "string[pyarrow_numpy]"

     def test_is_not_string_type(self, dtype):
         # Different from 
BaseDtypeTests.test_is_not_string_type From 826c2308e00c1fbb3bccde4645b37f70ef757ede Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Nov 2024 08:30:48 +0100 Subject: [PATCH 298/396] [backport 2.3.x] CI: fix wrong syntax in CI env yml files (mamba 2.0 compat) (#59910) (#60175) CI: fix wrong syntax in CI env yml files (mamba 2.0 compat) (#59910) * fix wrong syntax in CI env yml files * fix other files * remove micromamba pin (cherry picked from commit de4eaf8b2e7c6b840dbd0198d8c3edf5eaf5afff) --- .github/actions/setup-conda/action.yml | 2 -- ci/deps/actions-310.yaml | 4 ++-- ci/deps/actions-311-downstream_compat.yaml | 4 ++-- ci/deps/actions-311-numpydev.yaml | 4 ++-- ci/deps/actions-311-pyarrownightly.yaml | 4 ++-- ci/deps/actions-311.yaml | 4 ++-- ci/deps/actions-312.yaml | 4 ++-- ci/deps/actions-39-minimum_versions.yaml | 4 ++-- ci/deps/actions-pypy-39.yaml | 4 ++-- ci/deps/circle-310-arm64.yaml | 4 ++-- environment.yml | 4 ++-- scripts/generate_pip_deps_from_conda.py | 2 ++ 12 files changed, 22 insertions(+), 22 deletions(-) diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index 4fe901998cbcc..3eb68bdd2a15c 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -9,8 +9,6 @@ runs: - name: Install ${{ inputs.environment-file }} uses: mamba-org/setup-micromamba@v1 with: - # Pinning to avoid 2.0 failures - micromamba-version: '1.5.10-0' environment-file: ${{ inputs.environment-file }} environment-name: test condarc-file: ci/.condarc diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 7cb2d8171c0cb..0572091d88cd9 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -5,9 +5,9 @@ dependencies: - python=3.10 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index d8c3e6e220630..f7e9ad045ed04 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -6,9 +6,9 @@ dependencies: - python=3.11 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 21791e3a9c2eb..d714e99c765e7 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -5,8 +5,8 @@ dependencies: - python=3.11 # build dependencies - - versioneer[toml] - - meson[ninja]=1.2.1 + - versioneer + - meson=1.2.1 - meson-python=0.13.1 - cython>=0.29.33 diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 978611b3fa96f..ba655f9690af6 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -5,8 +5,8 @@ dependencies: - python=3.11 # build dependencies - - versioneer[toml] - - meson[ninja]=1.2.1 + - versioneer + - meson=1.2.1 - cython>=0.29.33 - meson-python=0.13.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 9a0cb5ab81d23..db89be7780bf0 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -5,9 +5,9 @@ dependencies: - python=3.11 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies diff --git 
a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index bf79d14cd1b78..4d690501571a7 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -5,9 +5,9 @@ dependencies: - python=3.12 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index 7067048c4434d..6e38a7c5f0774 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -7,9 +7,9 @@ dependencies: - python=3.9 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index bdc07931988d1..ba518312df24c 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -8,9 +8,9 @@ dependencies: - python=3.9[build=*_pypy] # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index 5433d00bb94b5..eeb1cb48b1018 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -5,9 +5,9 @@ dependencies: - python=3.10 # build dependencies - - versioneer[toml] + - versioneer - cython>=0.29.33 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/environment.yml b/environment.yml index 8987623bd865e..d5d6c329dae8a 100644 --- a/environment.yml +++ b/environment.yml @@ -7,9 +7,9 @@ dependencies: - pip # build dependencies - - versioneer[toml] + - versioneer - cython=3.0.5 - - meson[ninja]=1.2.1 + - meson=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 5fcf09cd073fe..1e6e8585f0b90 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -26,6 +26,8 @@ EXCLUDE = {"python", "c-compiler", "cxx-compiler"} REMAP_VERSION = {"tzdata": "2022.7"} CONDA_TO_PIP = { + "versioneer": "versioneer[toml]", + "meson": "meson[ninja]", "pytables": "tables", "psycopg2": "psycopg2-binary", "dask-core": "dask", From 6654c02ea411ba5c8567d16b79ffba0c42b351cf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Nov 2024 08:31:21 +0100 Subject: [PATCH 299/396] [backport 2.3.x] BUG: preserve (object) dtype in factorize (#60118) (#60174) BUG: preserve (object) dtype in factorize (#60118) * BUG: preserve (object) dtype in factorize * add fallback for float16 (cherry picked from commit 13926e5e298acf328b0c1347f008ef3f9c4eb078) --- pandas/core/base.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index e98f1157572bb..a67003895d288 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -48,6 +48,7 @@ from pandas.core.dtypes.generic import ( ABCDataFrame, ABCIndex, + ABCMultiIndex, ABCSeries, ) from pandas.core.dtypes.missing import ( @@ -1198,13 +1199,18 @@ def factorize( if uniques.dtype == np.float16: uniques = uniques.astype(np.float32) - if isinstance(self, ABCIndex): - # preserve e.g. 
MultiIndex + if isinstance(self, ABCMultiIndex): + # preserve MultiIndex uniques = self._constructor(uniques) else: from pandas import Index - uniques = Index(uniques) + try: + uniques = Index(uniques, dtype=self.dtype) + except NotImplementedError: + # not all dtypes are supported in Index that are allowed for Series + # e.g. float16 or bytes + uniques = Index(uniques) return codes, uniques _shared_docs[ From ce56f2e47602b53ea28fd70ba1b92e60126ee132 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Nov 2024 09:52:07 +0100 Subject: [PATCH 300/396] [backport 2.3.x] TST (string dtype): add explicit object vs str dtype to index fixture (#60116) (#60136) (cherry picked from commit 7bd594c81acb5f6428e9ef54ba5a9da1f2860a89) --- pandas/conftest.py | 3 ++- pandas/core/algorithms.py | 2 +- pandas/tests/base/test_misc.py | 1 + pandas/tests/indexes/test_any_index.py | 2 +- pandas/tests/indexes/test_common.py | 1 + pandas/tests/indexes/test_old_base.py | 3 ++- pandas/tests/indexes/test_setops.py | 8 +++++++- pandas/tests/series/methods/test_map.py | 1 + pandas/tests/test_algos.py | 1 + 9 files changed, 17 insertions(+), 5 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index c6237d0309630..b0818b11ab037 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -615,7 +615,8 @@ def _create_mi_with_dt64tz_level(): indices_dict = { - "string": Index([f"pandas_{i}" for i in range(100)]), + "object": Index([f"pandas_{i}" for i in range(100)], dtype=object), + "string": Index([f"pandas_{i}" for i in range(100)], dtype="str"), "datetime": date_range("2020-01-01", periods=100), "datetime-tz": date_range("2020-01-01", periods=100, tz="US/Pacific"), "period": period_range("2020-01-01", periods=100, freq="D"), diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 56600bd9a5107..085a4ee41dcc9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -931,7 +931,7 @@ def value_counts_internal( # For backwards compatibility, we let Index do its normal type # inference, _except_ for if if infers from object to bool. idx = Index(keys) - if idx.dtype == bool and keys.dtype == object: + if idx.dtype in [bool, "string"] and keys.dtype == object: idx = idx.astype(object) elif ( idx.dtype != keys.dtype # noqa: PLR1714 # # pylint: disable=R1714 diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index b42e01c76335c..1bf0a8d75dd4f 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -165,6 +165,7 @@ def test_searchsorted(request, index_or_series_obj): assert 0 <= index <= len(obj) +@pytest.mark.filterwarnings(r"ignore:Dtype inference:FutureWarning") def test_access_by_position(index_flat): index = index_flat diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index 10204cfb78e89..8edeaf9c16083 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -45,7 +45,7 @@ def test_map_identity_mapping(index, request): # GH#12766 result = index.map(lambda x: x) - if index.dtype == object and result.dtype == bool: + if index.dtype == object and result.dtype in [bool, "string"]: assert (index == result).all() # TODO: could work that into the 'exact="equiv"'? return # FIXME: doesn't belong in this file anymore! 
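(For context on the factorize change in the patch above (#60118 / #60174): a minimal sketch of the dtype round-trip it restores, using only public pandas API. The Series and values below are made up for illustration and are not part of any diff; this assumes pandas 2.3 with the backport applied.)

    import pandas as pd

    ser = pd.Series(["a", "b", "a"], dtype=object)
    codes, uniques = ser.factorize()
    print(codes)          # [0 1 0]
    # uniques is now built with Index(uniques, dtype=self.dtype), so the
    # original object dtype is preserved instead of being re-inferred
    # (e.g. to a string dtype when infer_string is enabled)
    print(uniques.dtype)  # object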
diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 05b2aa584674c..c08fcdaedbefe 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -147,6 +147,7 @@ def test_copy_and_deepcopy(self, index_flat): new_copy = index.copy(deep=True, name="banana") assert new_copy.name == "banana" + @pytest.mark.filterwarnings(r"ignore:Dtype inference:FutureWarning") def test_copy_name(self, index_flat): # GH#12309: Check that the "name" argument # passed at initialization is honored. diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 176bf893cafa8..2f6bdb1fd8969 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -260,7 +260,7 @@ def test_ensure_copied_data(self, index): "RangeIndex cannot be initialized from data, " "MultiIndex and CategoricalIndex are tested separately" ) - elif index.dtype == object and index.inferred_type == "boolean": + elif index.dtype == object and index.inferred_type in ["boolean", "string"]: init_kwargs["dtype"] = index.dtype index_type = type(index) @@ -485,6 +485,7 @@ def test_delete_base(self, index): with pytest.raises(IndexError, match=msg): index.delete(length) + @pytest.mark.filterwarnings(r"ignore:Dtype inference:FutureWarning") @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_equals(self, index): if isinstance(index, IntervalIndex): diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 72c3396f124b8..3845744dc0717 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -293,7 +293,13 @@ def test_difference_base(self, sort, index): first.difference([1, 2, 3], sort) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - def test_symmetric_difference(self, index): + def test_symmetric_difference(self, index, using_infer_string, request): + if ( + using_infer_string + and index.dtype == "object" + and index.inferred_type == "string" + ): + request.applymarker(pytest.mark.xfail(reason="TODO: infer_string")) if isinstance(index, CategoricalIndex): pytest.skip(f"Not relevant for {type(index).__name__}") if len(index) < 2: diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index e5281a18236da..f33f5edb5ee66 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -221,6 +221,7 @@ def test_map_category_string(): tm.assert_series_equal(a.map(c), exp) +@pytest.mark.filterwarnings(r"ignore:Dtype inference:FutureWarning") def test_map_empty(request, index): if isinstance(index, MultiIndex): request.applymarker( diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a7c2ec5acb7c2..97d6415e0de05 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -65,6 +65,7 @@ def test_factorize_complex(self): expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=object) tm.assert_numpy_array_equal(uniques, expected_uniques) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("sort", [True, False]) def test_factorize(self, index_or_series_obj, sort): obj = index_or_series_obj From e647fac2e5406399067654f981bcff6ec0b3afc8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 5 Nov 2024 07:58:52 +0100 Subject: [PATCH 301/396] [backport 2.3.x] BUG: Fix copy semantics in 
``__array__`` (#60046) (#60189) (cherry picked from commit eacf0326efb709169ebc49f040834670dfe4beb3) Co-authored-by: Joris Van den Bossche Co-authored-by: Sebastian Berg --- doc/source/whatsnew/v2.3.0.rst | 3 ++ pandas/core/arrays/arrow/array.py | 11 ++++- pandas/core/arrays/categorical.py | 33 +++++++++++---- pandas/core/arrays/datetimelike.py | 7 ++++ pandas/core/arrays/interval.py | 5 +++ pandas/core/arrays/masked.py | 12 +++++- pandas/core/arrays/numpy_.py | 3 ++ pandas/core/arrays/period.py | 15 ++++++- pandas/core/arrays/sparse/array.py | 15 +++++-- pandas/core/generic.py | 10 ++++- pandas/core/indexes/base.py | 6 ++- pandas/core/indexes/multi.py | 9 ++++ pandas/core/internals/construction.py | 12 +++--- pandas/core/series.py | 15 +++++-- pandas/tests/arrays/sparse/test_array.py | 31 ++++++++++++++ pandas/tests/arrays/test_datetimelike.py | 8 ++++ pandas/tests/base/test_conversion.py | 41 ++++++++++++++++--- pandas/tests/extension/base/interface.py | 21 ++++++++++ pandas/tests/extension/json/array.py | 10 ++++- pandas/tests/indexes/multi/test_conversion.py | 36 ++++++++++++++++ 20 files changed, 269 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index cda4754bb4acd..405c8fdc30961 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -32,6 +32,9 @@ enhancement1 Other enhancements ^^^^^^^^^^^^^^^^^^ +- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called + when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been + updated to work correctly with NumPy >= 2 (:issue:`57739`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) - diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 51136961c0fb3..2587d1d53640d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -662,7 +662,16 @@ def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" - return self.to_numpy(dtype=dtype) + if copy is False: + # TODO: By using `zero_copy_only` it may be possible to implement this + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + elif copy is None: + # `to_numpy(copy=False)` has the meaning of NumPy `copy=None`. + copy = False + + return self.to_numpy(dtype=dtype, copy=copy) def __invert__(self) -> Self: # This is a bit wise op for integer types diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 366253a923f6c..3383f35bb7d55 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -577,11 +577,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: raise ValueError("Cannot convert float NaN to integer") elif len(self.codes) == 0 or len(self.categories) == 0: - result = np.array( - self, - dtype=dtype, - copy=copy, - ) + # For NumPy 1.x compatibility we cannot use copy=None. And + # `copy=False` has the meaning of `copy=None` here: + if not copy: + result = np.asarray(self, dtype=dtype) + else: + result = np.array(self, dtype=dtype) else: # GH8628 (PERF): astype category codes instead of astyping array @@ -1642,6 +1643,17 @@ def __array__( """ The numpy array interface. + Users should not call this directly. Rather, it is invoked by + :func:`numpy.array` and :func:`numpy.asarray`. 
+ + Parameters + ---------- + dtype : np.dtype or None + Specifies the dtype for the array. + + copy : bool or None, optional + See :func:`numpy.asarray`. + Returns ------- numpy.array @@ -1659,13 +1671,18 @@ def __array__( >>> np.asarray(cat) array(['a', 'b'], dtype=object) """ + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + ret = take_nd(self.categories._values, self._codes) - if dtype and np.dtype(dtype) != self.categories.dtype: - return np.asarray(ret, dtype) # When we're a Categorical[ExtensionArray], like Interval, # we need to ensure __array__ gets all the way to an # ndarray. - return np.asarray(ret) + + # `take_nd` should already make a copy, so don't force again. + return np.asarray(ret, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # for binary ops, use our custom dunder methods diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 81e2f04f2ba2e..990116bad13d1 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -358,7 +358,14 @@ def __array__( ) -> np.ndarray: # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) return np.array(list(self), dtype=object) + + if copy is True: + return np.array(self._ndarray, dtype=dtype) return self._ndarray @overload diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 91db7f11bcbe0..5aac3d3b28db5 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1574,6 +1574,11 @@ def __array__( Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') """ + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + left = self._left right = self._right mask = self.isna() diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d7e816b9d3781..ba7b8e3e7398e 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -600,7 +600,17 @@ def __array__( the array interface, return my values We return an object array here to preserve our scalar values """ - return self.to_numpy(dtype=dtype) + if copy is False: + if not self._hasna: + # special case, here we can simply return the underlying data + return np.array(self._data, dtype=dtype, copy=copy) + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + if copy is None: + copy = False # The NumPy copy=False meaning is different here. + return self.to_numpy(dtype=dtype, copy=copy) _HANDLED_TYPES: tuple[type, ...]
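(A short sketch of the copy semantics these __array__ implementations provide, assuming NumPy >= 2 — older NumPy never forwards the copy= keyword to __array__, which is why the tests further below guard on np_version_gt2. The snippet uses only public pandas/NumPy API and is illustrative, not part of the diffs.)

    import numpy as np
    import pandas as pd

    # np.asarray (copy=None) stays zero-copy where the backing data allows it
    ser = pd.Series([1, 2, 3])
    assert np.may_share_memory(np.asarray(ser), np.asarray(ser))

    # copy=True always returns a fresh array
    assert not np.may_share_memory(
        np.array(ser, copy=True), np.array(ser, copy=True)
    )

    # copy=False must avoid a copy or raise; a masked array holding NA
    # cannot be converted without materializing, so it raises ValueError
    try:
        np.array(pd.Series([1, None], dtype="Int64"), copy=False)
    except ValueError as exc:
        print(exc)  # Unable to avoid copy while creating an array as requested.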
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index aafcd82114b97..9f7238a97d808 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -150,6 +150,9 @@ def dtype(self) -> NumpyEADtype: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: + if copy is not None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.array(self._ndarray, dtype=dtype, copy=copy) return np.asarray(self._ndarray, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index c1229e27ab51a..aad7737b8dd94 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -407,8 +407,19 @@ def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: if dtype == "i8": - return self.asi8 - elif dtype == bool: + # For NumPy 1.x compatibility we cannot use copy=None. And + # `copy=False` has the meaning of `copy=None` here: + if not copy: + return np.asarray(self.asi8, dtype=dtype) + else: + return np.array(self.asi8, dtype=dtype) + + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + if dtype == bool: return ~self._isnan # This will raise TypeError for non-object dtypes diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 82fcfa74ec7d2..13577e366d54b 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -554,11 +554,20 @@ def from_spmatrix(cls, data: spmatrix) -> Self: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: - fill_value = self.fill_value - if self.sp_index.ngaps == 0: # Compat for na dtype and int values. - return self.sp_values + if copy is True: + return np.array(self.sp_values) + else: + return self.sp_values + + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + fill_value = self.fill_value + if dtype is None: # Can NumPy represent this type? # If not, `np.result_type` will raise. We catch that diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 796357355fef4..bef2d1e1194f9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2150,9 +2150,15 @@ def __array__( self, dtype: npt.DTypeLike | None = None, copy: bool_t | None = None ) -> np.ndarray: values = self._values - arr = np.asarray(values, dtype=dtype) + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + arr = np.asarray(values, dtype=dtype) + else: + arr = np.array(values, dtype=dtype, copy=copy) + if ( - astype_is_view(values.dtype, arr.dtype) + copy is not True + and astype_is_view(values.dtype, arr.dtype) and using_copy_on_write() and self._mgr.is_single_block ): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8e8eb768130fd..fc53e044a6544 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -917,7 +917,11 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray: """ The array interface, return my values. 
""" - return np.asarray(self._data, dtype=dtype) + if copy is None: + # Note, that the if branch exists for NumPy 1.x support + return np.asarray(self._data, dtype=dtype) + + return np.array(self._data, dtype=dtype, copy=copy) def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 091ddbcc099be..9e002ccd3a787 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1311,6 +1311,15 @@ def copy( # type: ignore[override] def __array__(self, dtype=None, copy=None) -> np.ndarray: """the array interface, return my values""" + if copy is False: + # self.values is always a newly construct array, so raise. + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + if copy is True: + # explicit np.array call to ensure a copy is made and unique objects + # are returned, because self.values is cached + return np.array(self.values, dtype=dtype) return self.values def view(self, cls=None) -> Self: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index f3dbacc02bec9..137648ee52bf7 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -305,12 +305,12 @@ def ndarray_to_mgr( elif isinstance(values, (np.ndarray, ExtensionArray)): # drop subclass info - _copy = ( - copy_on_sanitize - if (dtype is None or astype_is_view(values.dtype, dtype)) - else False - ) - values = np.array(values, copy=_copy) + if copy_on_sanitize and (dtype is None or astype_is_view(values.dtype, dtype)): + # only force a copy now if copy=True was requested + # and a subsequent `astype` will not already result in a copy + values = np.array(values, copy=True, order="F") + else: + values = np.asarray(values) values = _ensure_2d(values) else: diff --git a/pandas/core/series.py b/pandas/core/series.py index 6fd019656d207..4e2e363885594 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -991,7 +991,7 @@ def __array__( the dtype is inferred from the data. copy : bool or None, optional - Unused. + See :func:`numpy.asarray`. 
Returns ------- @@ -1028,8 +1028,17 @@ def __array__( dtype='datetime64[ns]') """ values = self._values - arr = np.asarray(values, dtype=dtype) - if using_copy_on_write() and astype_is_view(values.dtype, arr.dtype): + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + arr = np.asarray(values, dtype=dtype) + else: + arr = np.array(values, dtype=dtype, copy=copy) + + if copy is True: + return arr + if using_copy_on_write() and ( + copy is False or astype_is_view(values.dtype, arr.dtype) + ): arr = arr.view() arr.flags.writeable = False return arr diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 883d6ea3959ff..1e8d36b184e48 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -4,6 +4,7 @@ import pytest from pandas._libs.sparse import IntIndex +from pandas.compat.numpy import np_version_gt2 import pandas as pd from pandas import ( @@ -478,3 +479,33 @@ def test_zero_sparse_column(): expected = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2]) tm.assert_frame_equal(result, expected) + + +def test_array_interface(arr_data, arr): + # https://github.com/pandas-dev/pandas/pull/60046 + result = np.asarray(arr) + tm.assert_numpy_array_equal(result, arr_data) + + # it always gives a copy by default + result_copy1 = np.asarray(arr) + result_copy2 = np.asarray(arr) + assert not np.may_share_memory(result_copy1, result_copy2) + + # or with explicit copy=True + result_copy1 = np.array(arr, copy=True) + result_copy2 = np.array(arr, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + # for sparse arrays, copy=False is never allowed + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + np.array(arr, copy=False) + + # except when there are actually no sparse filled values + arr2 = SparseArray(np.array([1, 2, 3])) + result_nocopy1 = np.array(arr2, copy=False) + result_nocopy2 = np.array(arr2, copy=False) + assert np.may_share_memory(result_nocopy1, result_nocopy2) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index ede81264cb415..0397913b69b26 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -1148,9 +1148,17 @@ def test_array_interface(self, arr1d): result = np.asarray(arr, dtype=object) tm.assert_numpy_array_equal(result, expected) + # to int64 gives the underlying representation result = np.asarray(arr, dtype="int64") tm.assert_numpy_array_equal(result, arr.asi8) + result2 = np.asarray(arr, dtype="int64") + assert np.may_share_memory(result, result2) + + result_copy1 = np.array(arr, dtype="int64", copy=True) + result_copy2 = np.array(arr, dtype="int64", copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + # to other dtypes msg = r"float\(\) argument must be a string or a( real)? 
number, not 'Period'" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index d62599c56e467..e2bf19e2e736c 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -4,6 +4,7 @@ from pandas._config import using_string_dtype from pandas.compat import HAS_PYARROW +from pandas.compat.numpy import np_version_gt2 from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -297,24 +298,27 @@ def test_array_multiindex_raises(): @pytest.mark.parametrize( - "arr, expected", + "arr, expected, zero_copy", [ - (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)), - (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)), + (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64), True), + (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object), False), ( pd.core.arrays.period_array(["2000", "2001"], freq="D"), np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]), + False, ), - (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])), + (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan]), False), ( IntervalArray.from_breaks([0, 1, 2]), np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), + False, ), - (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), + (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64), False), # tz-naive datetime ( DatetimeArray._from_sequence(np.array(["2000", "2001"], dtype="M8[ns]")), np.array(["2000", "2001"], dtype="M8[ns]"), + True, ), # tz-aware stays tz`-aware ( @@ -329,6 +333,7 @@ def test_array_multiindex_raises(): Timestamp("2000-01-02", tz="US/Central"), ] ), + False, ), # Timedelta ( @@ -336,6 +341,7 @@ def test_array_multiindex_raises(): np.array([0, 3600000000000], dtype="i8").view("m8[ns]") ), np.array([0, 3600000000000], dtype="m8[ns]"), + True, ), # GH#26406 tz is preserved in Categorical[dt64tz] ( @@ -346,10 +352,11 @@ def test_array_multiindex_raises(): Timestamp("2016-01-02", tz="US/Pacific"), ] ), + False, ), ], ) -def test_to_numpy(arr, expected, index_or_series_or_array, request): +def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array): box = index_or_series_or_array with tm.assert_produces_warning(None): @@ -361,6 +368,28 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request): result = np.asarray(thing) tm.assert_numpy_array_equal(result, expected) + # Additionally, we check the `copy=` semantics for array/asarray + # (these are implemented by us via `__array__`). + result_cp1 = np.array(thing, copy=True) + result_cp2 = np.array(thing, copy=True) + # When called with `copy=True` NumPy/we should ensure a copy was made + assert not np.may_share_memory(result_cp1, result_cp2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. 
+ return + + if not zero_copy: + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + # An error is always acceptable for `copy=False` + np.array(thing, copy=False) + + else: + result_nocopy1 = np.array(thing, copy=False) + result_nocopy2 = np.array(thing, copy=False) + # If copy=False was given, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + @pytest.mark.xfail( using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 6683c87e2b8fc..79eb64b5a654f 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import is_extension_array_dtype from pandas.core.dtypes.dtypes import ExtensionDtype @@ -71,6 +73,25 @@ def test_array_interface(self, data): expected = construct_1d_object_array_from_listlike(list(data)) tm.assert_numpy_array_equal(result, expected) + def test_array_interface_copy(self, data): + result_copy1 = np.array(data, copy=True) + result_copy2 = np.array(data, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + try: + result_nocopy1 = np.array(data, copy=False) + except ValueError: + # An error is always acceptable for `copy=False` + return + + result_nocopy2 = np.array(data, copy=False) + # If copy=False was given and did not raise, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + def test_is_extension_array_dtype(self, data): assert is_extension_array_dtype(data) assert is_extension_array_dtype(data.dtype) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 5cbd45a99ae5c..b6d72c10712f2 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -147,12 +147,20 @@ def __ne__(self, other): return NotImplemented def __array__(self, dtype=None, copy=None): + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." 
+ ) + if dtype is None: dtype = object if dtype == object: # on py38 builds it looks like numpy is inferring to a non-1D array return construct_1d_object_array_from_listlike(list(self)) - return np.asarray(self.data, dtype=dtype) + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.asarray(self.data, dtype=dtype) + return np.asarray(self.data, dtype=dtype, copy=copy) @property def nbytes(self) -> int: diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 3c2ca045d6f99..58a2dc00f937d 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + import pandas as pd from pandas import ( DataFrame, @@ -15,6 +17,40 @@ def test_to_numpy(idx): tm.assert_numpy_array_equal(result, exp) +def test_array_interface(idx): + # https://github.com/pandas-dev/pandas/pull/60046 + result = np.asarray(idx) + expected = np.empty((6,), dtype=object) + expected[:] = [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ] + tm.assert_numpy_array_equal(result, expected) + + # it always gives a copy by default, but the values are cached, so results + # are still sharing memory + result_copy1 = np.asarray(idx) + result_copy2 = np.asarray(idx) + assert np.may_share_memory(result_copy1, result_copy2) + + # with explicit copy=True, then it is an actual copy + result_copy1 = np.array(idx, copy=True) + result_copy2 = np.array(idx, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. 
+ return + + # for MultiIndex, copy=False is never allowed + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + np.array(idx, copy=False) + + def test_to_frame(): tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")] From 6a9ef0ca4a2bac53faaeff9ebc0a17cda3535373 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 5 Nov 2024 07:59:48 +0100 Subject: [PATCH 302/396] [backport 2.3.x] TST (string dtype): fix invalid comparison error message and update test (#60176) (#60188) (cherry picked from commit 9ec4a9150ef6dbf6da1248b7252141d48203d941) --- pandas/core/arrays/arrow/array.py | 2 +- pandas/tests/frame/test_arithmetic.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2587d1d53640d..8cf763265fd34 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -737,7 +737,7 @@ def _cmp_method(self, other, op): try: result[valid] = op(np_array[valid], other) except TypeError: - result = ops.invalid_comparison(np_array, other, op) + result = ops.invalid_comparison(self, other, op) result = pa.array(result, type=pa.bool_()) result = pc.if_else(valid, result, None) else: diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 6b4efc41aeffa..eb85c108ca238 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -13,7 +13,6 @@ from pandas._config import using_string_dtype -from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -1563,9 +1562,6 @@ def test_comparisons(self, simple_frame, float_frame, func): with pytest.raises(ValueError, match=msg): func(simple_frame, simple_frame[:2]) - @pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" - ) def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne): # GH 11565 df = DataFrame( @@ -1573,7 +1569,12 @@ def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne) ) f = getattr(operator, compare_operators_no_eq_ne) - msg = "'[<>]=?' not supported between instances of 'str' and 'int'" + msg = "|".join( + [ + "'[<>]=?' 
not supported between instances of 'str' and 'int'", + "Invalid comparison between dtype=str and int", + ] + ) with pytest.raises(TypeError, match=msg): f(df, 0) From 6f4ebe6cda2a1abeb0d26f22c72d64d4bae89273 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 4 Nov 2024 23:00:28 -0800 Subject: [PATCH 303/396] Backport PR #60134 on branch 2.3.x (TST (string dtype): remove xfails in extension tests + fix categorical/string dispatch) (#60178) Backport PR #60134: TST (string dtype): remove xfails in extension tests + fix categorical/string dispatch Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_.py | 1 - pandas/tests/extension/base/ops.py | 26 ---------------------- pandas/tests/extension/test_categorical.py | 2 -- pandas/tests/extension/test_numpy.py | 7 ------ 4 files changed, 36 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 4081e3e1a4779..f5c5cb2a45034 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -915,7 +915,6 @@ def _cmp_method(self, other, op): if not is_array_like(other): other = np.asarray(other) other = other[valid] - other = np.asarray(other) if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray, dtype="object") diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 547114ecfddd0..222ff42d45052 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -5,10 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - from pandas.core.dtypes.common import is_string_dtype import pandas as pd @@ -134,12 +130,6 @@ class BaseArithmeticOpsTests(BaseOpsUtil): series_array_exc: type[Exception] | None = TypeError divmod_exc: type[Exception] | None = TypeError - # TODO(infer_string) need to remove import of pyarrow - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # series & scalar if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype): @@ -149,11 +139,6 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators): ser = pd.Series(data) self.check_opname(ser, op_name, ser.iloc[0]) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype): @@ -163,22 +148,12 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): df = pd.DataFrame({"A": data}) self.check_opname(df, op_name, data[0]) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op_name = all_arithmetic_operators ser = pd.Series(data) self.check_opname(ser, op_name, pd.Series([ser.iloc[0]] * len(ser))) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_divmod(self, data): ser = pd.Series(data) self._check_divmod_op(ser, divmod, 1) @@ -194,7 +169,6 @@ def test_divmod_series_array(self, data, data_for_twos): other = pd.Series(other) self._check_divmod_op(other, ops.rdivmod, 
ser) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_add_series_with_extension_array(self, data): # Check adding an ExtensionArray to a Series of the same dtype matches # the behavior of adding the arrays directly and then wrapping in a diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index fd291908a4f96..135ea67c924d0 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -148,7 +148,6 @@ def test_map(self, data, na_action): result = data.map(lambda x: x, na_action=na_action) tm.assert_extension_array_equal(result, data) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): # frame & scalar op_name = all_arithmetic_operators @@ -160,7 +159,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): ) super().test_arith_frame_with_scalar(data, op_name) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): op_name = all_arithmetic_operators if op_name == "__rmod__": diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index b79b0a98efde4..e38144f4c615b 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -18,8 +18,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.dtypes import NumpyEADtype import pandas as pd @@ -244,7 +242,6 @@ def test_insert_invalid(self, data, invalid_scalar): frame_scalar_exc = None series_array_exc = None - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_divmod(self, data): divmod_exc = None if data.dtype.kind == "O": @@ -252,7 +249,6 @@ def test_divmod(self, data): self.divmod_exc = divmod_exc super().test_divmod(data) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_divmod_series_array(self, data): ser = pd.Series(data) exc = None @@ -261,7 +257,6 @@ def test_divmod_series_array(self, data): self.divmod_exc = exc self._check_divmod_op(ser, divmod, data) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): opname = all_arithmetic_operators series_scalar_exc = None @@ -275,7 +270,6 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request) self.series_scalar_exc = series_scalar_exc super().test_arith_series_with_scalar(data, all_arithmetic_operators) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_array(self, data, all_arithmetic_operators): opname = all_arithmetic_operators series_array_exc = None @@ -284,7 +278,6 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): self.series_array_exc = series_array_exc super().test_arith_series_with_array(data, all_arithmetic_operators) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): opname = all_arithmetic_operators frame_scalar_exc = None From a53604db7364ec467915b1e08d7e3bd6a0aa270e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 5 Nov 2024 
19:51:37 +0100 Subject: [PATCH 304/396] [backport 2.3.x] TST (string dtype): avoid hardcoded object dtype for columns in datetime_frame fixture (#60192) (#60198) TST (string dtype): avoid hardcoded object dtype for columns in datetime_frame fixture (#60192) (cherry picked from commit 34387bddffacb158a60a249b08411a8a1fe44455) --- pandas/tests/frame/conftest.py | 2 +- pandas/tests/frame/indexing/test_indexing.py | 1 - pandas/tests/frame/methods/test_to_csv.py | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index e07024b2e2a09..45b5d9b4aa698 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -18,7 +18,7 @@ def datetime_frame() -> DataFrame: """ return DataFrame( np.random.default_rng(2).standard_normal((100, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100, freq="B"), ) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 04dba325f060f..4094f14c50608 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -183,7 +183,6 @@ def test_getitem_boolean(self, mixed_float_frame, mixed_int_frame, datetime_fram if bif[c].dtype != bifw[c].dtype: assert bif[c].dtype == df[c].dtype - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_getitem_boolean_casting(self, datetime_frame): # don't upcast if we don't need to df = datetime_frame.copy() diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 4a65c3929944b..aca3bb5bccd7c 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -35,7 +35,6 @@ def read_csv(self, path, **kwargs): return read_csv(path, **params) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_from_csv1(self, float_frame, datetime_frame): with tm.ensure_clean("__tmp_to_csv_from_csv1__") as path: float_frame.iloc[:5, float_frame.columns.get_loc("A")] = np.nan @@ -533,7 +532,6 @@ def test_to_csv_headers(self): assert return_value is None tm.assert_frame_equal(to_df, recons) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_multiindex(self, float_frame, datetime_frame): frame = float_frame old_index = frame.index From 92a0c81c17466a24f22f25dd01ab6685bf857671 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 5 Nov 2024 12:00:21 -0800 Subject: [PATCH 305/396] Backport PR #60197 on branch 2.3.x (TYP/COMPAT: don't use Literal for Series.ndim to avoid tab completion bug in IPython) (#60200) Backport PR #60197: TYP/COMPAT: don't use Literal for Series.ndim to avoid tab completion bug in IPython Co-authored-by: Joris Van den Bossche --- pandas/core/base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index a67003895d288..af8f80db6a347 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -361,8 +361,11 @@ def __len__(self) -> int: # We need this defined here for mypy raise AbstractMethodError(self) + # Temporarily avoid using `-> Literal[1]:` because of an IPython (jedi) bug + # https://github.com/ipython/ipython/issues/14412 + # https://github.com/davidhalter/jedi/issues/1990 @property - def ndim(self) -> Literal[1]: 
+ def ndim(self) -> int: """ Number of dimensions of the underlying data, by definition 1. From 70e8a3ba187b49651910538c4fe9290ffbec96c3 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 5 Nov 2024 12:00:58 -0800 Subject: [PATCH 306/396] Backport PR #60196 on branch 2.3.x (BUG: fix inspect usage when pyarrow or jinja2 is not installed) (#60201) Backport PR #60196: BUG: fix inspect usage when pyarrow or jinja2 is not installed Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.3.0.rst | 3 ++- pandas/core/arrays/arrow/accessors.py | 2 +- pandas/core/frame.py | 5 +++++ pandas/tests/frame/test_api.py | 1 - pandas/tests/series/test_api.py | 8 -------- 5 files changed, 8 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 405c8fdc30961..0751554d87dc8 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -174,7 +174,8 @@ Styler Other ^^^^^ -- +- Fixed usage of ``inspect`` when the optional dependencies ``pyarrow`` or ``jinja2`` + are not installed (:issue:`60196`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 124f8fb6ad8bc..65f0784eaa3fd 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -46,7 +46,7 @@ def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: def _validate(self, data): dtype = data.dtype - if not isinstance(dtype, ArrowDtype): + if pa_version_under10p1 or not isinstance(dtype, ArrowDtype): # Raise AttributeError so that inspect can handle non-struct Series. raise AttributeError(self._validation_msg.format(dtype=dtype)) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1403fc2ceaaf8..ef48090f02c3f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1439,6 +1439,11 @@ def style(self) -> Styler: Please see `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. """ + # Raise AttributeError so that inspect works even if jinja2 is not installed. 
+ has_jinja2 = import_optional_dependency("jinja2", errors="ignore") + if not has_jinja2: + raise AttributeError("The '.style' accessor requires jinja2") + from pandas.io.formats.style import Styler return Styler(self) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index f6c7bd1f49b27..6c6944f806a2a 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -387,7 +387,6 @@ def test_constructor_expanddim(self): def test_inspect_getmembers(self): # GH38740 - pytest.importorskip("jinja2") df = DataFrame() msg = "DataFrame._data is deprecated" with tm.assert_produces_warning( diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index e53cd753a4192..7e10a337cdd3a 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -4,10 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - import pandas as pd from pandas import ( DataFrame, @@ -171,12 +167,8 @@ def test_attrs(self): result = s + 1 assert result.attrs == {"version": 1} - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_inspect_getmembers(self): # GH38782 - pytest.importorskip("jinja2") ser = Series(dtype=object) msg = "Series._data is deprecated" with tm.assert_produces_warning( From b9f1bc6a773aee3f3bde6a226c7c43ccc0f04de4 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 5 Nov 2024 12:29:32 -0800 Subject: [PATCH 307/396] Backport PR #60195 on branch 2.3.x (BUG (string dtype): fix where() for string dtype with python storage) (#60202) Backport PR #60195: BUG (string dtype): fix where() for string dtype with python storage Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_.py | 6 ++++++ pandas/tests/frame/indexing/test_where.py | 18 ++++++------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f5c5cb2a45034..92c274453b9d1 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -757,6 +757,12 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: # base class implementation that uses __setitem__ ExtensionArray._putmask(self, mask, value) + def _where(self, mask: npt.NDArray[np.bool_], value) -> Self: + # the super() method NDArrayBackedExtensionArray._where uses + # np.putmask which doesn't properly handle None/pd.NA, so using the + # base class implementation that uses __setitem__ + return ExtensionArray._where(self, mask, value) + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: if isinstance(values, BaseStringArray) or ( isinstance(values, ExtensionArray) and is_string_dtype(values.dtype) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index f0d868a4cb583..40506c90f3295 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -6,8 +6,6 @@ from pandas._config import using_string_dtype -from pandas.compat import HAS_PYARROW - from pandas.core.dtypes.common import is_scalar import pandas as pd @@ -985,9 +983,6 @@ def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): obj.mask(mask, null) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) @given(data=OPTIONAL_ONE_OF_ALL) def test_where_inplace_casting(data): # GH 22051 @@ 
-1084,19 +1079,18 @@ def test_where_producing_ea_cond_for_np_dtype(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize( "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] ) -def test_where_int_overflow(replacement, using_infer_string, request): +def test_where_int_overflow(replacement, using_infer_string): # GH 31687 df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]]) if using_infer_string and replacement not in (None, "snake"): - request.node.add_marker( - pytest.mark.xfail(reason="Can't set non-string into string column") - ) + with pytest.raises( + TypeError, match="Cannot set non-string value|Scalar must be NA or str" + ): + df.where(pd.notnull(df), replacement) + return result = df.where(pd.notnull(df), replacement) expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]]) From a82cf8e3626d15f06cbef4463ac746750d2229c6 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 5 Nov 2024 19:38:14 -0800 Subject: [PATCH 308/396] Backport PR #60206 on branch 2.3.x (STY: Fix lint error in test_where.py) (#60208) Backport PR #60206: STY: Fix lint error in test_where.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/frame/indexing/test_where.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 40506c90f3295..0517485888b38 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import is_scalar import pandas as pd From a83184f826d905357ea4feebfc26f1e82fbcd439 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 6 Nov 2024 19:36:22 +0100 Subject: [PATCH 309/396] [backport 2.3.x] TST (string dtype): update all tests in tests/frame/indexing (#60193) (#60212) * TST (string dtype): update all tests in tests/frame/indexing (#60193) (cherry picked from commit bec2dbca274a4f983790d069279a4b3aec184f49) * update for 2.3.x --- pandas/tests/frame/indexing/test_coercion.py | 24 +++++++++++++----- pandas/tests/frame/indexing/test_indexing.py | 18 +++++--------- pandas/tests/frame/indexing/test_insert.py | 6 ++--- pandas/tests/frame/indexing/test_setitem.py | 26 +++++++++----------- pandas/tests/frame/indexing/test_where.py | 18 +++++++++----- pandas/tests/frame/indexing/test_xs.py | 5 +--- 6 files changed, 51 insertions(+), 46 deletions(-) diff --git a/pandas/tests/frame/indexing/test_coercion.py b/pandas/tests/frame/indexing/test_coercion.py index 9d20821ae8bc6..f7f7b2c7c872a 100644 --- a/pandas/tests/frame/indexing/test_coercion.py +++ b/pandas/tests/frame/indexing/test_coercion.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -99,28 +97,42 @@ def test_6942(indexer_al): assert df.iloc[0, 0] == t2 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_26395(indexer_al): # .at case fixed by GH#45121 (best guess) df = DataFrame(index=["A", "B", "C"]) df["D"] = 0 indexer_al(df)["C", "D"] = 2 - expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64) + expected = DataFrame( + {"D": [0, 0, 2]}, + index=["A", "B", 
"C"], + columns=pd.Index(["D"], dtype=object), + dtype=np.int64, + ) tm.assert_frame_equal(df, expected) with tm.assert_produces_warning( FutureWarning, match="Setting an item of incompatible dtype" ): indexer_al(df)["C", "D"] = 44.5 - expected = DataFrame({"D": [0, 0, 44.5]}, index=["A", "B", "C"], dtype=np.float64) + expected = DataFrame( + {"D": [0, 0, 44.5]}, + index=["A", "B", "C"], + columns=pd.Index(["D"], dtype=object), + dtype=np.float64, + ) tm.assert_frame_equal(df, expected) with tm.assert_produces_warning( FutureWarning, match="Setting an item of incompatible dtype" ): indexer_al(df)["C", "D"] = "hello" - expected = DataFrame({"D": [0, 0, "hello"]}, index=["A", "B", "C"], dtype=object) + expected = DataFrame( + {"D": [0, 0, "hello"]}, + index=["A", "B", "C"], + columns=pd.Index(["D"], dtype=object), + dtype=object, + ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 4094f14c50608..1721fe2c0eb8b 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -12,7 +12,6 @@ from pandas._config import using_string_dtype from pandas._libs import iNaT -from pandas.compat import HAS_PYARROW from pandas.errors import ( InvalidIndexError, PerformanceWarning, @@ -518,18 +517,17 @@ def test_setitem_ambig(self, using_infer_string): else: assert dm[2].dtype == np.object_ - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - def test_setitem_None(self, float_frame, using_infer_string): + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + def test_setitem_None(self, float_frame): # GH #766 float_frame[None] = float_frame["A"] - key = None if not using_infer_string else np.nan tm.assert_series_equal( float_frame.iloc[:, -1], float_frame["A"], check_names=False ) tm.assert_series_equal( - float_frame.loc[:, key], float_frame["A"], check_names=False + float_frame.loc[:, None], float_frame["A"], check_names=False ) - tm.assert_series_equal(float_frame[key], float_frame["A"], check_names=False) + tm.assert_series_equal(float_frame[None], float_frame["A"], check_names=False) def test_loc_setitem_boolean_mask_allfalse(self): # GH 9596 @@ -1191,7 +1189,6 @@ def test_setitem_with_unaligned_tz_aware_datetime_column(self): df.loc[[0, 1, 2], "dates"] = column[[1, 0, 2]] tm.assert_series_equal(df["dates"], column) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_loc_setitem_datetimelike_with_inference(self): # GH 7592 # assignment of timedeltas with NaT @@ -1210,13 +1207,10 @@ def test_loc_setitem_datetimelike_with_inference(self): result = df.dtypes expected = Series( [np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2, - index=list("ABCDEFGH"), + index=Index(list("ABCDEFGH"), dtype=object), ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" - ) def test_getitem_boolean_indexing_mixed(self): df = DataFrame( { @@ -1258,7 +1252,7 @@ def test_getitem_boolean_indexing_mixed(self): tm.assert_frame_equal(df2, expected) df["foo"] = "test" - msg = "not supported between instances|unorderable types" + msg = "not supported between instances|unorderable types|Invalid comparison" with pytest.raises(TypeError, match=msg): df[df > 0.3] = 1 diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index 82b75459f08d0..4cf297b4c037d 100644 --- 
a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import PerformanceWarning from pandas import ( @@ -62,7 +60,6 @@ def test_insert_column_bug_4032(self): expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"]) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_insert_with_columns_dups(self): # GH#14291 df = DataFrame() @@ -70,7 +67,8 @@ def test_insert_with_columns_dups(self): df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True) df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True) exp = DataFrame( - [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"] + [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], + columns=Index(["A", "A", "A"], dtype=object), ) tm.assert_frame_equal(df, exp) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index bce3cb5dacabe..190218a82d231 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from pandas.core.dtypes.base import _registry as ea_registry @@ -148,13 +146,16 @@ def test_setitem_different_dtype(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_empty_columns(self): # GH 13522 df = DataFrame(index=["A", "B", "C"]) df["X"] = df.index df["X"] = ["x", "y", "z"] - exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"]) + exp = DataFrame( + data={"X": ["x", "y", "z"]}, + index=["A", "B", "C"], + columns=Index(["X"], dtype=object), + ) tm.assert_frame_equal(df, exp) def test_setitem_dt64_index_empty_columns(self): @@ -164,14 +165,15 @@ def test_setitem_dt64_index_empty_columns(self): df["A"] = rng assert df["A"].dtype == np.dtype("M8[ns]") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_timestamp_empty_columns(self): # GH#19843 df = DataFrame(index=range(3)) df["now"] = Timestamp("20130101", tz="UTC").as_unit("ns") expected = DataFrame( - [[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"] + [[Timestamp("20130101", tz="UTC")]] * 3, + index=range(3), + columns=Index(["now"], dtype=object), ) tm.assert_frame_equal(df, expected) @@ -204,14 +206,13 @@ def test_setitem_with_unaligned_sparse_value(self): expected = Series(SparseArray([1, 0, 0]), name="new_column") tm.assert_series_equal(df["new_column"], expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_period_preserves_dtype(self): # GH: 26861 data = [Period("2003-12", "D")] result = DataFrame([]) result["a"] = data - expected = DataFrame({"a": data}) + expected = DataFrame({"a": data}, columns=Index(["a"], dtype=object)) tm.assert_frame_equal(result, expected) @@ -677,11 +678,10 @@ def test_setitem_iloc_two_dimensional_generator(self): expected = DataFrame({"a": [1, 2, 3], "b": [4, 1, 1]}) tm.assert_frame_equal(df, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_dtypes_bytes_type_to_object(self): # GH 20734 index = Series(name="id", dtype="S24") - df = DataFrame(index=index) + df = DataFrame(index=index, columns=Index([], 
dtype="str")) df["a"] = Series(name="a", index=index, dtype=np.uint32) df["b"] = Series(name="b", index=index, dtype="S64") df["c"] = Series(name="c", index=index, dtype="S64") @@ -712,7 +712,6 @@ def test_setitem_ea_dtype_rhs_series(self): # TODO(ArrayManager) set column with 2d column array, see #44788 @td.skip_array_manager_not_yet_implemented - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_npmatrix_2d(self): # GH#42376 # for use-case df["x"] = sparse.random((10, 10)).mean(axis=1) @@ -721,7 +720,7 @@ def test_setitem_npmatrix_2d(self): ) a = np.ones((10, 1)) - df = DataFrame(index=np.arange(10)) + df = DataFrame(index=np.arange(10), columns=Index([], dtype="str")) df["np-array"] = a # Instantiation of `np.matrix` gives PendingDeprecationWarning @@ -936,12 +935,11 @@ def test_setitem_with_expansion_categorical_dtype(self): ser.name = "E" tm.assert_series_equal(result2.sort_index(), ser.sort_index()) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_scalars_no_index(self): # GH#16823 / GH#17894 df = DataFrame() df["foo"] = 1 - expected = DataFrame(columns=["foo"]).astype(np.int64) + expected = DataFrame(columns=Index(["foo"], dtype=object)).astype(np.int64) tm.assert_frame_equal(df, expected) def test_setitem_newcol_tuple_key(self, float_frame): diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 0517485888b38..861147f5b58dd 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -46,7 +46,6 @@ def is_ok(s): class TestDataFrameIndexingWhere: - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_get(self, where_frame, float_string_frame): def _check_get(df, cond, check_dtypes=True): other1 = _safe_add(df) @@ -64,7 +63,10 @@ def _check_get(df, cond, check_dtypes=True): # check getting df = where_frame if df is float_string_frame: - msg = "'>' not supported between instances of 'str' and 'int'" + msg = ( + "'>' not supported between instances of 'str' and 'int'" + "|Invalid comparison" + ) with pytest.raises(TypeError, match=msg): df > 0 return @@ -98,7 +100,6 @@ def test_where_upcasting(self): tm.assert_series_equal(result, expected) @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_alignment(self, where_frame, float_string_frame): # aligning def _check_align(df, cond, other, check_dtypes=True): @@ -130,7 +131,10 @@ def _check_align(df, cond, other, check_dtypes=True): df = where_frame if df is float_string_frame: - msg = "'>' not supported between instances of 'str' and 'int'" + msg = ( + "'>' not supported between instances of 'str' and 'int'" + "|Invalid comparison" + ) with pytest.raises(TypeError, match=msg): df > 0 return @@ -174,7 +178,6 @@ def test_where_invalid(self): df.mask(0) @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_set(self, where_frame, float_string_frame, mixed_int_frame): # where inplace @@ -196,7 +199,10 @@ def _check_set(df, cond, check_dtypes=True): df = where_frame if df is float_string_frame: - msg = "'>' not supported between instances of 'str' and 'int'" + msg = ( + "'>' not supported between instances of 'str' and 'int'" + "|Invalid comparison" + ) with 
pytest.raises(TypeError, match=msg): df > 0 return diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index 4ca435fa5acc5..2aa27d1d6a548 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SettingWithCopyError from pandas import ( @@ -79,10 +77,9 @@ def test_xs( else: assert (expected == 5).all() - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_xs_corner(self): # pathological mixed-type reordering case - df = DataFrame(index=[0]) + df = DataFrame(index=[0], columns=Index([], dtype="str")) df["A"] = 1.0 df["B"] = "foo" df["C"] = 2.0 From 678266cade4e4f7baa36bd327aa3ca9c8bea061a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:04:42 -0800 Subject: [PATCH 310/396] Backport PR #60215 on branch 2.3.x (BUG (string dtype): fix escaping of newline/tab characters in the repr) (#60220) Backport PR #60215: BUG (string dtype): fix escaping of newline/tab characters in the repr Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_.py | 11 +++++++++++ pandas/tests/frame/test_repr.py | 3 --- pandas/tests/series/test_formats.py | 14 +++++++------- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 92c274453b9d1..4801b70a27dd4 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,5 +1,6 @@ from __future__ import annotations +from functools import partial import operator from typing import ( TYPE_CHECKING, @@ -64,6 +65,8 @@ from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna +from pandas.io.formats import printing + if TYPE_CHECKING: import pyarrow @@ -387,6 +390,14 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: raise ValueError return cls._from_sequence(scalars, dtype=dtype) + def _formatter(self, boxed: bool = False): + formatter = partial( + printing.pprint_thing, + escape_chars=("\t", "\r", "\n"), + quote_strings=not boxed, + ) + return formatter + def _str_map( self, f, diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index f7700af6beea0..6184e791cab5d 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( NA, Categorical, @@ -176,7 +174,6 @@ def test_repr_mixed_big(self): repr(biggie) - @pytest.mark.xfail(using_string_dtype(), reason="/r in") def test_repr(self): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index 77e77a9337d63..4f93e7424bfd5 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -144,11 +142,13 @@ def test_tidy_repr_name_0(self, arg): rep_str = repr(ser) assert "Name: 0" in rep_str - @pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string): investigate failure" - ) - def test_newline(self): - ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) + def test_newline(self, any_string_dtype): + ser = 
Series( + ["a\n\r\tb"], + name="a\n\r\td", + index=Index(["a\n\r\tf"], dtype=any_string_dtype), + dtype=any_string_dtype, + ) assert "\t" not in repr(ser) assert "\r" not in repr(ser) assert "a\n" not in repr(ser) From b5d061542bd651ae1f59c3dcd49452cd7517c258 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:05:50 -0800 Subject: [PATCH 311/396] Backport PR #60222 on branch 2.3.x (ENH (string dtype): accept string_view in addition to string/large_string for ArrowStringArray input) (#60223) Backport PR #60222: ENH (string dtype): accept string_view in addition to string/large_string for ArrowStringArray input Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_arrow.py | 7 +++++++ pandas/tests/arrays/string_/test_string_arrow.py | 14 ++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e7dd4f9dc5718..b6e98d8fdc7e5 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -18,6 +18,7 @@ from pandas.compat import ( pa_version_under10p1, pa_version_under13p0, + pa_version_under16p0, ) from pandas.util._exceptions import find_stack_level @@ -65,6 +66,10 @@ def _chk_pyarrow_available() -> None: raise ImportError(msg) +def _is_string_view(typ): + return not pa_version_under16p0 and pa.types.is_string_view(typ) + + # TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from # ObjectStringArrayMixin because we want to have the object-dtype based methods as # fallback for the ones that pyarrow doesn't yet support @@ -122,11 +127,13 @@ def __init__(self, values) -> None: _chk_pyarrow_available() if isinstance(values, (pa.Array, pa.ChunkedArray)) and ( pa.types.is_string(values.type) + or _is_string_view(values.type) or ( pa.types.is_dictionary(values.type) and ( pa.types.is_string(values.type.value_type) or pa.types.is_large_string(values.type.value_type) + or _is_string_view(values.type.value_type) ) ) ): diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 2f3840e92b62a..aa87f5fc0f49a 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -99,6 +99,20 @@ def test_constructor_valid_string_type_value_dictionary(string_type, chunked): assert pa.types.is_large_string(arr._pa_array.type) +@pytest.mark.parametrize("chunked", [True, False]) +def test_constructor_valid_string_view(chunked): + # requires pyarrow>=18 for casting string_view to string + pa = pytest.importorskip("pyarrow", minversion="18") + + arr = pa.array(["1", "2", "3"], pa.string_view()) + if chunked: + arr = pa.chunked_array(arr) + + arr = ArrowStringArray(arr) + # dictionary type get converted to dense large string array + assert pa.types.is_large_string(arr._pa_array.type) + + def test_constructor_from_list(): # GH#27673 pytest.importorskip("pyarrow") From f8c7acc1c01545498b54c98683288377e5210fc3 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 7 Nov 2024 06:31:04 -0800 Subject: [PATCH 312/396] Backport PR #60191 on branch 2.3.x (TST: add extra test case for np.array(obj, copy=False) read-only behaviour) (#60226) Backport PR #60191: TST: add extra test case for np.array(obj, copy=False) read-only behaviour Co-authored-by: Joris Van den Bossche --- pandas/core/generic.py | 6 +++++ 
pandas/tests/copy_view/test_array.py | 37 ++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bef2d1e1194f9..e55a54112ee72 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2149,6 +2149,12 @@ def empty(self) -> bool_t: def __array__( self, dtype: npt.DTypeLike | None = None, copy: bool_t | None = None ) -> np.ndarray: + if copy is False and not self._mgr.is_single_block and not self.empty: + # check this manually, otherwise ._values will already return a copy + # and np.array(values, copy=False) will not raise an error + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) values = self._values if copy is None: # Note: branch avoids `copy=None` for NumPy 1.x support diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index 9a3f83e0293f5..06d9424450011 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + from pandas import ( DataFrame, Series, @@ -15,8 +17,12 @@ @pytest.mark.parametrize( "method", - [lambda ser: ser.values, lambda ser: np.asarray(ser)], - ids=["values", "asarray"], + [ + lambda ser: ser.values, + lambda ser: np.asarray(ser), + lambda ser: np.array(ser, copy=False), + ], + ids=["values", "asarray", "array"], ) def test_series_values(using_copy_on_write, method): ser = Series([1, 2, 3], name="name") @@ -45,8 +51,12 @@ def test_series_values(using_copy_on_write, method): @pytest.mark.parametrize( "method", - [lambda df: df.values, lambda df: np.asarray(df)], - ids=["values", "asarray"], + [ + lambda df: df.values, + lambda df: np.asarray(df), + lambda ser: np.array(ser, copy=False), + ], + ids=["values", "asarray", "array"], ) def test_dataframe_values(using_copy_on_write, using_array_manager, method): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -100,7 +110,7 @@ def test_series_to_numpy(using_copy_on_write): arr[0] = 0 assert ser.iloc[0] == 0 - # specify copy=False gives a writeable array + # specify copy=True gives a writeable array ser = Series([1, 2, 3], name="name") arr = ser.to_numpy(copy=True) assert not np.shares_memory(arr, get_array(ser, "name")) @@ -174,6 +184,23 @@ def test_dataframe_multiple_numpy_dtypes(): assert not np.shares_memory(arr, get_array(df, "a")) assert arr.flags.writeable is True + if np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. 
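
# A minimal sketch (not part of this patch) of the behaviour the generic.py
# change above introduces: with NumPy >= 2, ``np.array(obj, copy=False)``
# means "never copy", so it must raise for a DataFrame whose data spans more
# than one block, while a single-block frame can still be viewed zero-copy.
#
#     import numpy as np
#     import pandas as pd
#
#     mixed = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})  # int + float blocks
#     np.array(mixed, copy=False)
#     # ValueError: Unable to avoid copy while creating an array as requested.
#
#     single = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})  # one int64 block
#     arr = np.array(single, copy=False)  # zero-copy view of the block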
+ + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + arr = np.array(df, copy=False) + + arr = np.array(df, copy=True) + assert arr.flags.writeable is True + + +def test_dataframe_single_block_copy_true(): + # the copy=False/None cases are tested above in test_dataframe_values + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + arr = np.array(df, copy=True) + assert not np.shares_memory(arr, get_array(df, "a")) + assert arr.flags.writeable is True + def test_values_is_ea(using_copy_on_write): df = DataFrame({"a": date_range("2012-01-01", periods=3)}) From 168e3533d916d7a3505cc702687d9744e5488fe4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Nov 2024 15:31:36 +0100 Subject: [PATCH 313/396] [backport 2.3.x] TST (string dtype): avoid explicit object dtype Index in fixture data (#60217) (#60225) (cherry picked from commit 4b04a2f0043ad04b5546750a8947dfeef68cdb75) --- pandas/_testing/__init__.py | 2 ++ pandas/conftest.py | 10 +++++----- pandas/tests/frame/methods/test_align.py | 3 --- pandas/tests/frame/test_reductions.py | 1 - pandas/tests/series/indexing/test_setitem.py | 1 - pandas/tests/series/methods/test_reindex.py | 2 +- pandas/tests/series/methods/test_to_csv.py | 3 --- 7 files changed, 8 insertions(+), 14 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 3aa7c64831efe..2d066b581f1c6 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -519,6 +519,8 @@ def shares_memory(left, right) -> bool: if isinstance(left, MultiIndex): return shares_memory(left._codes, right) if isinstance(left, (Index, Series)): + if isinstance(right, (Index, Series)): + return shares_memory(left._values, right._values) return shares_memory(left._values, right) if isinstance(left, NDArrayBackedExtensionArray): diff --git a/pandas/conftest.py b/pandas/conftest.py index b0818b11ab037..1567708d04b20 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -548,7 +548,7 @@ def multiindex_year_month_day_dataframe_random_data(): """ tdf = DataFrame( np.random.default_rng(2).standard_normal((100, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100, freq="B"), ) ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() @@ -743,7 +743,7 @@ def string_series() -> Series: """ return Series( np.arange(30, dtype=np.float64) * 1.1, - index=Index([f"i_{i}" for i in range(30)], dtype=object), + index=Index([f"i_{i}" for i in range(30)]), name="series", ) @@ -754,7 +754,7 @@ def object_series() -> Series: Fixture for Series of dtype object with Index of unique strings """ data = [f"foo_{i}" for i in range(30)] - index = Index([f"bar_{i}" for i in range(30)], dtype=object) + index = Index([f"bar_{i}" for i in range(30)]) return Series(data, index=index, name="objects", dtype=object) @@ -846,8 +846,8 @@ def int_frame() -> DataFrame: """ return DataFrame( np.ones((30, 4), dtype=np.int64), - index=Index([f"foo_{i}" for i in range(30)], dtype=object), - columns=Index(list("ABCD"), dtype=object), + index=Index([f"foo_{i}" for i in range(30)]), + columns=Index(list("ABCD")), ) diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py index 15a97a99caa5a..5a9c47866dae8 100644 --- a/pandas/tests/frame/methods/test_align.py +++ b/pandas/tests/frame/methods/test_align.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from 
pandas import ( DataFrame, @@ -157,7 +155,6 @@ def test_align_series_condition(self): expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_align_int(self, int_frame): # test other non-float types other = DataFrame(index=range(5), columns=["A", "B", "C"]) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 824d53c8d5d13..bee95e8295746 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1058,7 +1058,6 @@ def test_sum_bools(self): # ---------------------------------------------------------------------- # Index of max / min - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmin(self, float_frame, int_frame, skipna, axis): diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 0b9b1fc080cfe..c28d3c9fedbd5 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -574,7 +574,6 @@ def test_setitem_with_expansion_type_promotion(self): expected = Series([Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) tm.assert_series_equal(ser, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_not_contained(self, string_series): # set item that's not contained ser = string_series.copy() diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 0923a2d42ce10..ecfbecf12bdd3 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -25,7 +25,7 @@ def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) - assert np.may_share_memory(string_series.index, identity.index) + assert tm.shares_memory(string_series.index, identity.index) assert identity.index.is_(string_series.index) assert identity.index.identical(string_series.index) diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index efb249fdedf3d..ba75c7786ef72 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import Series import pandas._testing as tm @@ -26,7 +24,6 @@ def read_csv(self, path, **kwargs): return out - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_from_csv(self, datetime_series, string_series): # freq doesn't round-trip datetime_series.index = datetime_series.index._with_freq(None) From 64f9907f0630cf7c18e88abad99f36a4fbf7bc3d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Nov 2024 08:33:24 +0100 Subject: [PATCH 314/396] [backport 2.3.x] BUG (string dtype): fix qualifier in memory usage info (#60221) (#60231) (cherry picked from commit 0937c95777d44462d67fd5b299d4563984e78332) --- pandas/core/indexes/base.py | 4 ++- pandas/core/indexes/multi.py | 9 ++++--- pandas/tests/frame/methods/test_info.py | 34 +++++++++++++++++------- pandas/tests/series/methods/test_info.py | 30 +++++++++++++-------- 4 files changed, 52 insertions(+), 25 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 
fc53e044a6544..5da327a82c02b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5326,7 +5326,9 @@ def _is_memory_usage_qualified(self) -> bool: """ Return a boolean if we need a qualified .info display. """ - return is_object_dtype(self.dtype) + return is_object_dtype(self.dtype) or ( + is_string_dtype(self.dtype) and self.dtype.storage == "python" # type: ignore[union-attr] + ) def __contains__(self, key: Any) -> bool: """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9e002ccd3a787..7cb28214c7289 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -65,6 +65,7 @@ is_list_like, is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -1344,10 +1345,12 @@ def dtype(self) -> np.dtype: def _is_memory_usage_qualified(self) -> bool: """return a boolean if we need a qualified .info display""" - def f(level) -> bool: - return "mixed" in level or "string" in level or "unicode" in level + def f(dtype) -> bool: + return is_object_dtype(dtype) or ( + is_string_dtype(dtype) and dtype.storage == "python" + ) - return any(f(level) for level in self._inferred_type_levels) + return any(f(level.dtype) for level in self.levels) # Cannot determine type of "memory_usage" @doc(Index.memory_usage) # type: ignore[has-type] diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index 475632667a87a..f0ae00fa6febb 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -10,6 +10,7 @@ from pandas._config import using_string_dtype from pandas.compat import ( + HAS_PYARROW, IS64, PYPY, ) @@ -435,18 +436,25 @@ def test_usage_via_getsizeof(): assert abs(diff) < 100 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_info_memory_usage_qualified(): +def test_info_memory_usage_qualified(using_infer_string): buf = StringIO() df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) df.info(buf=buf) assert "+" not in buf.getvalue() buf = StringIO() - df = DataFrame(1, columns=list("ab"), index=list("ABC")) + df = DataFrame(1, columns=list("ab"), index=Index(list("ABC"), dtype=object)) df.info(buf=buf) assert "+" in buf.getvalue() + buf = StringIO() + df = DataFrame(1, columns=list("ab"), index=Index(list("ABC"), dtype="str")) + df.info(buf=buf) + if using_infer_string and HAS_PYARROW: + assert "+" not in buf.getvalue() + else: + assert "+" in buf.getvalue() + buf = StringIO() df = DataFrame( 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]) @@ -459,7 +467,10 @@ def test_info_memory_usage_qualified(): 1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]]) ) df.info(buf=buf) - assert "+" in buf.getvalue() + if using_infer_string and HAS_PYARROW: + assert "+" not in buf.getvalue() + else: + assert "+" in buf.getvalue() def test_info_memory_usage_bug_on_multiindex(): @@ -496,16 +507,15 @@ def test_info_categorical(): df.info(buf=buf) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system") -def test_info_int_columns(): +def test_info_int_columns(using_infer_string): # GH#37245 df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"]) buf = StringIO() df.info(show_counts=True, buf=buf) result = buf.getvalue() expected = textwrap.dedent( - """\ + f"""\ Index: 2 entries, A to B Data columns (total 2 columns): @@ -514,19 +524,23 @@ def 
test_info_int_columns(): 0 1 2 non-null int64 1 2 2 non-null int64 dtypes: int64(2) - memory usage: 48.0+ bytes + memory usage: {'50.0' if using_infer_string and HAS_PYARROW else '48.0+'} bytes """ ) assert result == expected @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_memory_usage_empty_no_warning(): +def test_memory_usage_empty_no_warning(using_infer_string): # GH#50066 df = DataFrame(index=["a", "b"]) with tm.assert_produces_warning(None): result = df.memory_usage() - expected = Series(16 if IS64 else 8, index=["Index"]) + if using_infer_string and HAS_PYARROW: + value = 18 + else: + value = 16 if IS64 else 8 + expected = Series(value, index=["Index"]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_info.py b/pandas/tests/series/methods/test_info.py index 8fac40fe5fb25..7defad8a463f3 100644 --- a/pandas/tests/series/methods/test_info.py +++ b/pandas/tests/series/methods/test_info.py @@ -7,10 +7,14 @@ from pandas._config import using_string_dtype -from pandas.compat import PYPY +from pandas.compat import ( + HAS_PYARROW, + PYPY, +) from pandas import ( CategoricalIndex, + Index, MultiIndex, Series, date_range, @@ -41,7 +45,9 @@ def test_info_categorical(): @pytest.mark.parametrize("verbose", [True, False]) -def test_info_series(lexsorted_two_level_string_multiindex, verbose): +def test_info_series( + lexsorted_two_level_string_multiindex, verbose, using_infer_string +): index = lexsorted_two_level_string_multiindex ser = Series(range(len(index)), index=index, name="sth") buf = StringIO() @@ -63,10 +69,11 @@ def test_info_series(lexsorted_two_level_string_multiindex, verbose): 10 non-null int64 """ ) + qualifier = "" if using_infer_string and HAS_PYARROW else "+" expected += textwrap.dedent( f"""\ dtypes: int64(1) - memory usage: {ser.memory_usage()}.0+ bytes + memory usage: {ser.memory_usage()}.0{qualifier} bytes """ ) assert result == expected @@ -142,20 +149,21 @@ def test_info_memory_usage_deep_pypy(): assert s_object.memory_usage(deep=True) == s_object.memory_usage() -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( - "series, plus", + "index, plus", [ - (Series(1, index=[1, 2, 3]), False), - (Series(1, index=list("ABC")), True), - (Series(1, index=MultiIndex.from_product([range(3), range(3)])), False), + ([1, 2, 3], False), + (Index(list("ABC"), dtype="str"), not (using_string_dtype() and HAS_PYARROW)), + (Index(list("ABC"), dtype=object), True), + (MultiIndex.from_product([range(3), range(3)]), False), ( - Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]])), - True, + MultiIndex.from_product([range(3), ["foo", "bar"]]), + not (using_string_dtype() and HAS_PYARROW), ), ], ) -def test_info_memory_usage_qualified(series, plus): +def test_info_memory_usage_qualified(index, plus): + series = Series(1, index=index) buf = StringIO() series.info(buf=buf) if plus: From db68cd5ba1c3102e4e6b560f46420b6e495b4dcd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Nov 2024 09:24:35 +0100 Subject: [PATCH 315/396] [backport 2.3.x] ERR (string dtype): harmonize setitem error message for python and pyarrow storage (#60219) (#60232) (cherry picked from commit 692ea6f9d4b05187a05f0811d3241211855d6efb) --- pandas/core/arrays/arrow/array.py | 4 ++-- pandas/core/arrays/masked.py | 2 +- pandas/core/arrays/string_.py | 12 +++++++++--- pandas/core/arrays/string_arrow.py | 15 ++++++++++++--- pandas/tests/arrays/masked/test_indexing.py | 2 +- 
pandas/tests/arrays/string_/test_string.py | 17 ++++------------- pandas/tests/frame/indexing/test_indexing.py | 2 +- pandas/tests/frame/indexing/test_where.py | 4 ++-- pandas/tests/indexing/test_indexing.py | 4 +--- pandas/tests/indexing/test_loc.py | 2 +- pandas/tests/series/indexing/test_setitem.py | 4 ++-- 11 files changed, 36 insertions(+), 32 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 8cf763265fd34..e8ce1f4526f89 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1134,7 +1134,7 @@ def fillna( try: fill_value = self._box_pa(value, pa_type=self._pa_array.type) except pa.ArrowTypeError as err: - msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" + msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'" raise TypeError(msg) from err try: @@ -2126,7 +2126,7 @@ def _maybe_convert_setitem_value(self, value): try: value = self._box_pa(value, self._pa_array.type) except pa.ArrowTypeError as err: - msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" + msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'" raise TypeError(msg) from err return value diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index ba7b8e3e7398e..0e839dc7a80bb 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -302,7 +302,7 @@ def _validate_setitem_value(self, value): # Note: without the "str" here, the f-string rendering raises in # py38 builds. - raise TypeError(f"Invalid value '{str(value)}' for dtype {self.dtype}") + raise TypeError(f"Invalid value '{value!s}' for dtype '{self.dtype}'") def __setitem__(self, key, value) -> None: key = check_array_indexer(self, key) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 4801b70a27dd4..aae9f98032eff 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -654,7 +654,8 @@ def _validate_scalar(self, value): return self.dtype.na_value elif not isinstance(value, str): raise TypeError( - f"Cannot set non-string value '{value}' into a string array." + f"Invalid value '{value}' for dtype '{self.dtype}'. Value should be a " + f"string or missing value, got '{type(value).__name__}' instead." ) return value @@ -743,7 +744,9 @@ def __setitem__(self, key, value) -> None: value = self.dtype.na_value elif not isinstance(value, str): raise TypeError( - f"Cannot set non-string value '{value}' into a StringArray." + f"Invalid value '{value}' for dtype '{self.dtype}'. Value should " + f"be a string or missing value, got '{type(value).__name__}' " + "instead." ) else: if not is_array_like(value): @@ -753,7 +756,10 @@ def __setitem__(self, key, value) -> None: # compatible, compatibility with arrow backed strings value = np.asarray(value) if len(value) and not lib.is_string_array(value, skipna=True): - raise TypeError("Must provide strings.") + raise TypeError( + "Invalid value for dtype 'str'. Value should be a " + "string or missing value (or array of those)." 
+ ) mask = isna(value) if mask.any(): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b6e98d8fdc7e5..c15e50f698a3d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -215,7 +215,10 @@ def insert(self, loc: int, item) -> ArrowStringArray: if self.dtype.na_value is np.nan and item is np.nan: item = libmissing.NA if not isinstance(item, str) and item is not libmissing.NA: - raise TypeError("Scalar must be NA or str") + raise TypeError( + f"Invalid value '{item}' for dtype 'str'. Value should be a " + f"string or missing value, got '{type(item).__name__}' instead." + ) return super().insert(loc, item) def _convert_bool_result(self, values, na=lib.no_default, method_name=None): @@ -249,13 +252,19 @@ def _maybe_convert_setitem_value(self, value): if isna(value): value = None elif not isinstance(value, str): - raise TypeError("Scalar must be NA or str") + raise TypeError( + f"Invalid value '{value}' for dtype 'str'. Value should be a " + f"string or missing value, got '{type(value).__name__}' instead." + ) else: value = np.array(value, dtype=object, copy=True) value[isna(value)] = None for v in value: if not (v is None or isinstance(v, str)): - raise TypeError("Must provide strings") + raise TypeError( + "Invalid value for dtype 'str'. Value should be a " + "string or missing value (or array of those)." + ) return super()._maybe_convert_setitem_value(value) def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: diff --git a/pandas/tests/arrays/masked/test_indexing.py b/pandas/tests/arrays/masked/test_indexing.py index 28ee451a7ddd7..753d562c87ffa 100644 --- a/pandas/tests/arrays/masked/test_indexing.py +++ b/pandas/tests/arrays/masked/test_indexing.py @@ -8,7 +8,7 @@ class TestSetitemValidation: def _check_setitem_invalid(self, arr, invalid): - msg = f"Invalid value '{str(invalid)}' for dtype {arr.dtype}" + msg = f"Invalid value '{invalid!s}' for dtype '{arr.dtype}'" msg = re.escape(msg) with pytest.raises(TypeError, match=msg): arr[0] = invalid diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index e511ba62d5d09..14c02723191a8 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -108,14 +108,11 @@ def test_none_to_nan(cls, dtype): def test_setitem_validates(cls, dtype): arr = cls._from_sequence(["a", "b"], dtype=dtype) - if dtype.storage == "python": - msg = "Cannot set non-string value '10' into a StringArray." - else: - msg = "Scalar must be NA or str" + msg = "Invalid value '10' for dtype 'str" with pytest.raises(TypeError, match=msg): arr[0] = 10 - msg = "Must provide strings" + msg = "Invalid value for dtype 'str" with pytest.raises(TypeError, match=msg): arr[:] = np.array([1, 2]) @@ -510,10 +507,7 @@ def test_fillna_args(dtype): expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) - if dtype.storage == "pyarrow": - msg = "Invalid value '1' for dtype str" - else: - msg = "Cannot set non-string value '1' into a StringArray." 
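
# A minimal sketch (not part of this patch) of the harmonised error: with the
# messages above, python- and pyarrow-backed string arrays now reject
# non-string values the same way, so one pattern matches both storages.
# Assumes the pandas 2.3 "str" dtype alias; the tail of the message
# ("got 'int' instead") is derived from the type of the offending value.
#
#     import pandas as pd
#
#     ser = pd.Series(["a", "b"], dtype="str")
#     ser[0] = 10
#     # TypeError: Invalid value '10' for dtype 'str'. Value should be a string
#     # or missing value, got 'int' instead.
#
#     ser.fillna(1)  # raises TypeError: Invalid value '1' for dtype 'str'...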
+ msg = "Invalid value '1' for dtype 'str" with pytest.raises(TypeError, match=msg): arr.fillna(value=1) @@ -754,10 +748,7 @@ def test_setitem_scalar_with_mask_validation(dtype): # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) - if dtype.storage == "python": - msg = "Cannot set non-string value" - else: - msg = "Scalar must be NA or str" + msg = "Invalid value '1' for dtype 'str" with pytest.raises(TypeError, match=msg): ser[mask] = 1 diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 1721fe2c0eb8b..c0ab51a484cdf 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1340,7 +1340,7 @@ def test_setting_mismatched_na_into_nullable_fails( r"timedelta64\[ns\] cannot be converted to (Floating|Integer)Dtype", r"datetime64\[ns\] cannot be converted to (Floating|Integer)Dtype", "'values' contains non-numeric NA", - r"Invalid value '.*' for dtype (U?Int|Float)\d{1,2}", + r"Invalid value '.*' for dtype '(U?Int|Float)\d{1,2}'", ] ) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 861147f5b58dd..5fd3796d0255a 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -976,7 +976,7 @@ def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): mask = np.array([True, True, False], ndmin=obj.ndim).T - msg = r"Invalid value '.*' for dtype (U?Int|Float)\d{1,2}" + msg = r"Invalid value '.*' for dtype '(U?Int|Float)\d{1,2}'" for null in tm.NP_NAT_OBJECTS + [pd.NaT]: # NaT is an NA value that we should *not* cast to pd.NA dtype @@ -1091,7 +1091,7 @@ def test_where_int_overflow(replacement, using_infer_string): df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]]) if using_infer_string and replacement not in (None, "snake"): with pytest.raises( - TypeError, match="Cannot set non-string value|Scalar must be NA or str" + TypeError, match=f"Invalid value '{replacement}' for dtype 'str'" ): df.where(pd.notnull(df), replacement) return diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 0ff33ba88b16f..07275302dcf9f 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -728,9 +728,7 @@ def run_tests(df, rhs, right_loc, right_iloc): right_iloc["joe"] = [1.0, "@-28", "@-20", "@-12", 17.0] right_iloc["jolie"] = ["@2", -26.0, -18.0, -10.0, "@18"] if using_infer_string: - with pytest.raises( - TypeError, match="Must provide strings|Scalar must be NA or str" - ): + with pytest.raises(TypeError, match="Invalid value"): with tm.assert_produces_warning( FutureWarning, match="incompatible dtype" ): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index bdc6d9aff6f4e..dc4f159cfd3c3 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1292,7 +1292,7 @@ def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string # assigning with loc/iloc attempts to set the values inplace, which # in this case is successful if using_infer_string: - with pytest.raises(TypeError, match="Must provide strings"): + with pytest.raises(TypeError, match="Invalid value"): result.loc[result.index, "A"] = [float(x) for x in col_data] else: result.loc[result.index, "A"] = [float(x) for x in col_data] diff --git 
a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index c28d3c9fedbd5..d3ecbfe8f6cc7 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -888,7 +888,7 @@ def test_index_where(self, obj, key, expected, warn, val, using_infer_string): mask[key] = True if using_infer_string and obj.dtype == object: - with pytest.raises(TypeError, match="Scalar must"): + with pytest.raises(TypeError, match="Invalid value"): Index(obj).where(~mask, val) else: res = Index(obj).where(~mask, val) @@ -901,7 +901,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): mask[key] = True if using_infer_string and obj.dtype == object: - with pytest.raises(TypeError, match="Scalar must"): + with pytest.raises(TypeError, match="Invalid value"): Index(obj).putmask(mask, val) else: res = Index(obj).putmask(mask, val) From cacd4bbf24b325603b81d1339505c4deccec7701 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Nov 2024 12:58:17 +0100 Subject: [PATCH 316/396] [backport 2.3.x] TST (string dtype): resolve xfails in pandas/tests/series (#60233) (#60240) (cherry picked from commit 3f7bc81ae6839803ecc0da073fe83e9194759550) --- .../series/accessors/test_dt_accessor.py | 4 -- pandas/tests/series/indexing/test_indexing.py | 21 +++++++--- pandas/tests/series/indexing/test_setitem.py | 38 +++++++++---------- pandas/tests/series/indexing/test_where.py | 17 ++++----- pandas/tests/series/methods/test_replace.py | 34 +++++++++++------ pandas/tests/series/methods/test_unstack.py | 5 +-- pandas/tests/series/test_logical_ops.py | 1 + 7 files changed, 64 insertions(+), 56 deletions(-) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 18ee81581bdc3..a06a3a0d40675 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -11,8 +11,6 @@ import pytest import pytz -from pandas._config import using_string_dtype - from pandas._libs.tslibs.timezones import maybe_get_tz from pandas.errors import SettingWithCopyError @@ -571,7 +569,6 @@ def test_strftime(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime_dt64_days(self): ser = Series(date_range("20130101", periods=5)) ser.iloc[0] = pd.NaT @@ -586,7 +583,6 @@ def test_strftime_dt64_days(self): expected = Index( ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], - dtype=np.object_, ) # dtype may be S10 or U10 depending on python version tm.assert_index_equal(result, expected) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index a26e541732d36..9ab7dff64b182 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError from pandas import ( @@ -270,18 +268,29 @@ def test_slice(string_series, object_series, using_copy_on_write, warn_copy_on_w assert (string_series[10:20] == 0).all() -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_timedelta_assignment(): # GH 8209 s = Series([], dtype=object) s.loc["B"] = timedelta(1) - tm.assert_series_equal(s, Series(Timedelta("1 days"), index=["B"])) + expected = Series( + Timedelta("1 days"), 
dtype="timedelta64[ns]", index=Index(["B"], dtype=object) + ) + tm.assert_series_equal(s, expected) s = s.reindex(s.index.insert(0, "A")) - tm.assert_series_equal(s, Series([np.nan, Timedelta("1 days")], index=["A", "B"])) + expected = Series( + [np.nan, Timedelta("1 days")], + dtype="timedelta64[ns]", + index=Index(["A", "B"], dtype=object), + ) + tm.assert_series_equal(s, expected) s.loc["A"] = timedelta(1) - expected = Series(Timedelta("1 days"), index=["A", "B"]) + expected = Series( + Timedelta("1 days"), + dtype="timedelta64[ns]", + index=Index(["A", "B"], dtype=object), + ) tm.assert_series_equal(s, expected) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index d3ecbfe8f6cc7..d95ee99489076 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -8,9 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW from pandas.compat.numpy import ( np_version_gt2, np_version_gte1p24, @@ -37,6 +34,7 @@ concat, date_range, interval_range, + isna, period_range, timedelta_range, ) @@ -564,14 +562,16 @@ def test_append_timedelta_does_not_cast(self, td, using_infer_string, request): tm.assert_series_equal(ser, expected) assert isinstance(ser["td"], Timedelta) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_with_expansion_type_promotion(self): # GH#12599 ser = Series(dtype=object) ser["a"] = Timestamp("2016-01-01") ser["b"] = 3.0 ser["c"] = "foo" - expected = Series([Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) + expected = Series( + [Timestamp("2016-01-01"), 3.0, "foo"], + index=Index(["a", "b", "c"], dtype=object), + ) tm.assert_series_equal(ser, expected) def test_setitem_not_contained(self, string_series): @@ -850,11 +850,6 @@ def test_mask_key(self, obj, key, expected, warn, val, indexer_sli): indexer_sli(obj)[mask] = val tm.assert_series_equal(obj, expected) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_series_where(self, obj, key, expected, warn, val, is_inplace): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -870,6 +865,11 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace): obj = obj.copy() arr = obj._values + if obj.dtype == "string" and not (isinstance(val, str) or isna(val)): + with pytest.raises(TypeError, match="Invalid value"): + obj.where(~mask, val) + return + res = obj.where(~mask, val) if val is NA and res.dtype == object: @@ -882,29 +882,27 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace): self._check_inplace(is_inplace, orig, arr, obj) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - def test_index_where(self, obj, key, expected, warn, val, using_infer_string): + def test_index_where(self, obj, key, expected, warn, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if using_infer_string and obj.dtype == object: + if obj.dtype == "string" and not (isinstance(val, str) or isna(val)): with pytest.raises(TypeError, match="Invalid value"): - Index(obj).where(~mask, val) + Index(obj, dtype=obj.dtype).where(~mask, val) else: - res = Index(obj).where(~mask, val) + res = Index(obj, dtype=obj.dtype).where(~mask, val) expected_idx = Index(expected, dtype=expected.dtype) tm.assert_index_equal(res, expected_idx) - @pytest.mark.xfail(using_string_dtype(), 
reason="TODO(infer_string)", strict=False) - def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): + def test_index_putmask(self, obj, key, expected, warn, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if using_infer_string and obj.dtype == object: + if obj.dtype == "string" and not (isinstance(val, str) or isna(val)): with pytest.raises(TypeError, match="Invalid value"): - Index(obj).putmask(mask, val) + Index(obj, dtype=obj.dtype).putmask(mask, val) else: - res = Index(obj).putmask(mask, val) + res = Index(obj, dtype=obj.dtype).putmask(mask, val) tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 013386202c966..0fa2f63e5fb36 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import is_integer import pandas as pd @@ -232,7 +230,6 @@ def test_where_ndframe_align(): tm.assert_series_equal(out, expected) -@pytest.mark.xfail(using_string_dtype(), reason="can't set ints into string") def test_where_setitem_invalid(): # GH 2702 # make sure correct exceptions are raised on invalid list assignment @@ -242,7 +239,7 @@ def test_where_setitem_invalid(): "different length than the value" ) # slice - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) with pytest.raises(ValueError, match=msg("slice")): s[0:3] = list(range(27)) @@ -252,18 +249,18 @@ def test_where_setitem_invalid(): tm.assert_series_equal(s.astype(np.int64), expected) # slice with step - s = Series(list("abcdef")) + s = Series(list("abcdef"), dtype=object) with pytest.raises(ValueError, match=msg("slice")): s[0:4:2] = list(range(27)) - s = Series(list("abcdef")) + s = Series(list("abcdef"), dtype=object) s[0:4:2] = list(range(2)) expected = Series([0, "b", 1, "d", "e", "f"]) tm.assert_series_equal(s, expected) # neg slices - s = Series(list("abcdef")) + s = Series(list("abcdef"), dtype=object) with pytest.raises(ValueError, match=msg("slice")): s[:-1] = list(range(27)) @@ -273,18 +270,18 @@ def test_where_setitem_invalid(): tm.assert_series_equal(s, expected) # list - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) with pytest.raises(ValueError, match=msg("list-like")): s[[0, 1, 2]] = list(range(27)) - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) with pytest.raises(ValueError, match=msg("list-like")): s[[0, 1, 2]] = list(range(2)) # scalar - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) s[0] = list(range(10)) expected = Series([list(range(10)), "b", "c"]) tm.assert_series_equal(s, expected) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index c59dbc4ed95d7..79a66526a0004 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -391,7 +391,6 @@ def test_replace_mixed_types_with_string(self): expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "categorical, numeric", [ @@ -399,11 +398,15 @@ def test_replace_mixed_types_with_string(self): (pd.Categorical(["A", "B"], categories=["A", "B"]), [1, 2]), ], ) - def test_replace_categorical(self, categorical, numeric): + def 
test_replace_categorical(self, categorical, numeric, using_infer_string): # GH 24971, GH#23305 ser = pd.Series(categorical) msg = "Downcasting behavior in `replace`" msg = "with CategoricalDtype is deprecated" + if using_infer_string: + with pytest.raises(TypeError, match="Invalid value"): + ser.replace({"A": 1, "B": 2}) + return with tm.assert_produces_warning(FutureWarning, match=msg): result = ser.replace({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") @@ -731,17 +734,25 @@ def test_replace_nullable_numeric(self): with pytest.raises(TypeError, match="Invalid value"): ints.replace(1, 9.5) - @pytest.mark.xfail(using_string_dtype(), reason="can't fill 1 in string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_series(self, regex): # GH-48644 - series = pd.Series(["0"]) + series = pd.Series(["0"], dtype=object) expected = pd.Series([1]) msg = "Downcasting behavior in `replace`" with tm.assert_produces_warning(FutureWarning, match=msg): result = series.replace(to_replace="0", value=1, regex=regex) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("regex", [False, True]) + def test_replace_regex_dtype_series_string(self, regex, using_infer_string): + if not using_infer_string: + # then this is object dtype which is already tested above + return + series = pd.Series(["0"], dtype="str") + with pytest.raises(TypeError, match="Invalid value"): + series.replace(to_replace="0", value=1, regex=regex) + def test_replace_different_int_types(self, any_int_numpy_dtype): # GH#45311 labs = pd.Series([1, 1, 1, 0, 0, 2, 2, 2], dtype=any_int_numpy_dtype) @@ -761,20 +772,19 @@ def test_replace_value_none_dtype_numeric(self, val): expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) - def test_replace_change_dtype_series(self, using_infer_string): + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + def test_replace_change_dtype_series(self): # GH#25797 - df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]}) - warn = FutureWarning if using_infer_string else None - with tm.assert_produces_warning(warn, match="Downcasting"): - df["Test"] = df["Test"].replace([True], [np.nan]) - expected = pd.DataFrame.from_dict({"Test": ["0.5", np.nan, "0.6"]}) + df = pd.DataFrame({"Test": ["0.5", True, "0.6"]}, dtype=object) + df["Test"] = df["Test"].replace([True], [np.nan]) + expected = pd.DataFrame({"Test": ["0.5", np.nan, "0.6"]}, dtype=object) tm.assert_frame_equal(df, expected) - df = pd.DataFrame.from_dict({"Test": ["0.5", None, "0.6"]}) + df = pd.DataFrame({"Test": ["0.5", None, "0.6"]}, dtype=object) df["Test"] = df["Test"].replace([None], [np.nan]) tm.assert_frame_equal(df, expected) - df = pd.DataFrame.from_dict({"Test": ["0.5", None, "0.6"]}) + df = pd.DataFrame({"Test": ["0.5", None, "0.6"]}, dtype=object) df["Test"] = df["Test"].fillna(np.nan) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/series/methods/test_unstack.py b/pandas/tests/series/methods/test_unstack.py index 8569e0f49716a..11995260dd0be 100644 --- a/pandas/tests/series/methods/test_unstack.py +++ b/pandas/tests/series/methods/test_unstack.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -136,11 +134,10 @@ def test_unstack_mixed_type_name_in_multiindex( tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def 
test_unstack_multi_index_categorical_values(): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) mi = df.stack(future_stack=True).index.rename(["major", "minor"]) diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 26bdfcbc6ec56..8d7adc1c1aae6 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -431,6 +431,7 @@ def test_logical_ops_label_based(self, using_infer_string): for e in [Series(["z"])]: if using_infer_string: # TODO(infer_string) should this behave differently? + # -> https://github.com/pandas-dev/pandas/issues/60234 with pytest.raises( TypeError, match="not supported for dtype|unsupported operand type" ): From 9465bf12b3643f3144944e7ecdbf2aa21cdfbaa5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Nov 2024 19:09:01 +0100 Subject: [PATCH 317/396] [2.3.x] CI: skip lxml encode test on Windows (#60238) CI: skip lxml encode test on Windows --- pandas/tests/io/test_html.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 826c0a1ca7cf9..b12098d4904c1 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1381,6 +1381,7 @@ def test_displayed_only_with_many_elements(self, displayed_only, flavor_read_htm expected = DataFrame({"A": [1, 4], "B": [2, 5]}) tm.assert_frame_equal(result, expected) + @td.skip_if_windows() @pytest.mark.filterwarnings( "ignore:You provided Unicode markup but also provided a value for " "from_encoding.*:UserWarning" From 45275859b1d59b4193f5c319410d13dbe2be6233 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 8 Nov 2024 10:44:41 -0800 Subject: [PATCH 318/396] Backport PR #60241 on branch 2.3.x (TST (string dtype): resolve xfail in arrow interface tests) (#60248) Backport PR #60241: TST (string dtype): resolve xfail in arrow interface tests Co-authored-by: Joris Van den Bossche --- pandas/tests/frame/test_arrow_interface.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py index dc163268f64b9..b36b6b5ffe0cc 100644 --- a/pandas/tests/frame/test_arrow_interface.py +++ b/pandas/tests/frame/test_arrow_interface.py @@ -2,8 +2,6 @@ import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -11,9 +9,8 @@ pa = pytest.importorskip("pyarrow") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @td.skip_if_no("pyarrow", min_version="14.0") -def test_dataframe_arrow_interface(): +def test_dataframe_arrow_interface(using_infer_string): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) capsule = df.__arrow_c_stream__() @@ -25,7 +22,8 @@ def test_dataframe_arrow_interface(): ) table = pa.table(df) - expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + string_type = pa.large_string() if using_infer_string else pa.string() + expected = pa.table({"a": [1, 2, 3], "b": pa.array(["a", "b", "c"], string_type)}) assert table.equals(expected) schema = pa.schema([("a", pa.int8()), ("b", pa.string())]) @@ -34,13 +32,13 @@ def test_dataframe_arrow_interface(): assert table.equals(expected) -@pytest.mark.xfail(using_string_dtype(), 
reason="TODO(infer_string)") @td.skip_if_no("pyarrow", min_version="15.0") -def test_dataframe_to_arrow(): +def test_dataframe_to_arrow(using_infer_string): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) table = pa.RecordBatchReader.from_stream(df).read_all() - expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + string_type = pa.large_string() if using_infer_string else pa.string() + expected = pa.table({"a": [1, 2, 3], "b": pa.array(["a", "b", "c"], string_type)}) assert table.equals(expected) schema = pa.schema([("a", pa.int8()), ("b", pa.string())]) From 2c89984d9f7f2aa077d68fd11c12bf2636a3c16a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 9 Nov 2024 00:34:34 +0100 Subject: [PATCH 319/396] [backport 2.3.x] BUG (string dtype): correctly enable idxmin/max for python-storage strings (#60242) (#60256) BUG (string dtype): correctly enable idxmin/max for python-storage strings (#60242) (cherry picked from commit 754d09163ae08f2b87daa41f2263556dbb809616) --- pandas/core/arrays/string_.py | 2 +- pandas/tests/frame/test_reductions.py | 6 ------ pandas/tests/reductions/test_reductions.py | 8 -------- 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index aae9f98032eff..7e8726f96f90a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -846,7 +846,7 @@ def _reduce( else: return nanops.nanall(self._ndarray, skipna=skipna) - if name in ["min", "max", "sum"]: + if name in ["min", "max", "argmin", "argmax", "sum"]: result = getattr(self, name)(skipna=skipna, axis=axis, **kwargs) if keepdims: return self._from_sequence([result], dtype=self.dtype) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index bee95e8295746..84d56864b3219 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( IS64, is_platform_windows, @@ -1091,7 +1089,6 @@ def test_idxmin_empty(self, index, skipna, axis): expected = Series(dtype=index.dtype) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_idxmin_numeric_only(self, numeric_only): df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) @@ -1108,8 +1105,6 @@ def test_idxmin_axis_2(self, float_frame): with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmax(self, float_frame, int_frame, skipna, axis): frame = float_frame @@ -1142,7 +1137,6 @@ def test_idxmax_empty(self, index, skipna, axis): expected = Series(dtype=index.dtype) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_idxmax_numeric_only(self, numeric_only): df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index ed2b01b09bb71..496b00a0547b7 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -7,10 +7,6 @@ import numpy 
as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - import pandas as pd from pandas import ( Categorical, @@ -1246,10 +1242,6 @@ def test_idxminmax_object_dtype(self, using_infer_string): with pytest.raises(TypeError, match=msg): ser3.idxmin(skipna=False) - # TODO(infer_string) implement argmin/max for python string dtype - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_idxminmax_object_frame(self): # GH#4279 df = DataFrame([["zimm", 2.5], ["biff", 1.0], ["bid", 12.0]]) From 08086572f2b23c9dba1e881b4ecd16a55392ee04 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 9 Nov 2024 19:26:07 +0100 Subject: [PATCH 320/396] [backport 2.3.x] TST (string dtype): fix groupby xfails with using_infer_string + update error message (#59430) (#60246) * TST (string dtype): fix groupby xfails with using_infer_string + update error message (#59430) Co-authored-by: Joris Van den Bossche (cherry picked from commit e5dd89d4d74d8e2a06256023717880788f2b10ed) * fix test --------- Co-authored-by: jbrockmendel --- pandas/core/arrays/arrow/array.py | 14 +++++ pandas/core/arrays/base.py | 14 +++++ pandas/core/groupby/groupby.py | 4 +- pandas/tests/frame/test_stack_unstack.py | 4 +- pandas/tests/groupby/aggregate/test_cython.py | 4 +- pandas/tests/groupby/methods/test_quantile.py | 9 ++- pandas/tests/groupby/test_groupby.py | 56 ++++++++++++++----- pandas/tests/groupby/test_groupby_subclass.py | 2 +- pandas/tests/groupby/test_numeric_only.py | 20 +++++-- pandas/tests/groupby/test_raises.py | 50 +++++++++++++++-- .../tests/groupby/transform/test_transform.py | 1 + pandas/tests/resample/test_resample_api.py | 20 ++++++- pandas/tests/reshape/merge/test_join.py | 4 +- pandas/tests/reshape/test_pivot.py | 8 ++- 14 files changed, 167 insertions(+), 43 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e8ce1f4526f89..13e10c8d3a738 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2285,6 +2285,20 @@ def _groupby_op( **kwargs, ): if isinstance(self.dtype, StringDtype): + if how in [ + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + ]: + raise TypeError( + f"dtype '{self.dtype}' does not support operation '{how}'" + ) return super()._groupby_op( how=how, has_dropped_na=has_dropped_na, diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index abfe2369b0d0d..62ca2a45fb941 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2369,6 +2369,20 @@ def _groupby_op( # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray + if op.how in [ + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + ]: + raise TypeError( + f"dtype '{self.dtype}' does not support operation '{how}'" + ) if op.how not in ["any", "all"]: # Fail early to avoid conversion to object op._get_cython_function(op.kind, op.how, np.dtype(object), False) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 296a601288f9d..c8e2ccc7bdaeb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4394,9 +4394,9 @@ def quantile( starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups) def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: - if is_object_dtype(vals.dtype): + if isinstance(vals.dtype, StringDtype) or 
is_object_dtype(vals.dtype): raise TypeError( - "'quantile' cannot be performed against 'object' dtypes!" + f"dtype '{vals.dtype}' does not support operation 'quantile'" ) inference: DtypeObj | None = None diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 8bb5eb2d5c57a..af84ee021252f 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -2083,7 +2083,7 @@ def test_unstack_period_frame(self): @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) - def test_stack_multiple_bug(self, future_stack): + def test_stack_multiple_bug(self, future_stack, using_infer_string): # bug when some uniques are not present in the data GH#3170 id_col = ([1] * 3) + ([2] * 3) name = (["a"] * 3) + (["b"] * 3) @@ -2095,6 +2095,8 @@ def test_stack_multiple_bug(self, future_stack): multi.columns.name = "Params" unst = multi.unstack("ID") msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): unst.resample("W-THU").mean() down = unst.resample("W-THU").mean(numeric_only=True) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index fa8a6cb4120b2..2990fb5949242 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -166,14 +166,14 @@ def test_cython_agg_return_dict(): def test_cython_fail_agg(): dr = bdate_range("1/1/2000", periods=50) - ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) + ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr) grouped = ts.groupby(lambda x: x.month) summed = grouped.sum() msg = "using SeriesGroupBy.sum" with tm.assert_produces_warning(FutureWarning, match=msg): # GH#53425 - expected = grouped.agg(np.sum) + expected = grouped.agg(np.sum).astype(object) tm.assert_series_equal(summed, expected) diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index d3bc815402ade..4269b41a0871b 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -174,7 +174,8 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) - with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): + msg = "dtype 'object' does not support operation 'quantile'" + with pytest.raises(TypeError, match=msg): df.groupby("key").quantile() @@ -253,7 +254,6 @@ def test_groupby_quantile_nullable_array(values, q): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): @@ -263,9 +263,8 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): expected = df.groupby("a")[["b"]].quantile(q) tm.assert_frame_equal(result, expected) else: - with pytest.raises( - TypeError, match="'quantile' cannot be performed against 'object' dtypes!" 
- ): + msg = "dtype '.*' does not support operation 'quantile'" + with pytest.raises(TypeError, match=msg): df.groupby("a").quantile(q, numeric_only=numeric_only) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 13269ea9c0920..3e2d15ede3648 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -640,7 +640,7 @@ def test_frame_multi_key_function_list(): tm.assert_frame_equal(agged, expected) -def test_frame_multi_key_function_list_partial_failure(): +def test_frame_multi_key_function_list_partial_failure(using_infer_string): data = DataFrame( { "A": [ @@ -691,6 +691,8 @@ def test_frame_multi_key_function_list_partial_failure(): grouped = data.groupby(["A", "B"]) funcs = ["mean", "std"] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -981,9 +983,11 @@ def test_groupby_multi_corner(df): tm.assert_frame_equal(agged, expected) -def test_raises_on_nuisance(df): +def test_raises_on_nuisance(df, using_infer_string): grouped = df.groupby("A") msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1026,7 +1030,7 @@ def test_keep_nuisance_agg(df, agg_function): ["sum", "mean", "prod", "std", "var", "sem", "median"], ) @pytest.mark.parametrize("numeric_only", [True, False]) -def test_omit_nuisance_agg(df, agg_function, numeric_only): +def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string): # GH 38774, GH 38815 grouped = df.groupby("A") @@ -1034,7 +1038,10 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): if agg_function in no_drop_nuisance and not numeric_only: # Added numeric_only as part of GH#46560; these do not drop nuisance # columns when numeric_only is False - if agg_function in ("std", "sem"): + if using_infer_string: + msg = f"dtype 'str' does not support operation '{agg_function}'" + klass = TypeError + elif agg_function in ("std", "sem"): klass = ValueError msg = "could not convert string to float: 'one'" else: @@ -1055,16 +1062,24 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): tm.assert_frame_equal(result, expected) -def test_raise_on_nuisance_python_single(df): +def test_raise_on_nuisance_python_single(df, using_infer_string): # GH 38815 grouped = df.groupby("A") - with pytest.raises(ValueError, match="could not convert"): + + err = ValueError + msg = "could not convert" + if using_infer_string: + err = TypeError + msg = "dtype 'str' does not support operation 'skew'" + with pytest.raises(err, match=msg): grouped.skew() -def test_raise_on_nuisance_python_multiple(three_group): +def test_raise_on_nuisance_python_multiple(three_group, using_infer_string): grouped = three_group.groupby(["A", "B"]) msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1102,12 +1117,16 @@ def test_nonsense_func(): df.groupby(lambda x: x + "foo") -def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data): +def test_wrap_aggregated_output_multindex( + multiindex_dataframe_random_data, using_infer_string +): df = 
multiindex_dataframe_random_data.T df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): df.groupby(keys).agg("mean") agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean") @@ -1299,8 +1318,10 @@ def test_groupby_with_hier_columns(): def test_grouping_ndarray(df): grouped = df.groupby(df["A"].values) + grouped2 = df.groupby(df["A"].rename(None)) + result = grouped.sum() - expected = df.groupby(df["A"].rename(None)).sum() + expected = grouped2.sum() tm.assert_frame_equal(result, expected) @@ -1793,8 +1814,8 @@ def test_no_dummy_key_names(df): result = df.groupby(df["A"].values).sum() assert result.index.name is None - result = df.groupby([df["A"].values, df["B"].values]).sum() - assert result.index.names == (None, None) + result2 = df.groupby([df["A"].values, df["B"].values]).sum() + assert result2.index.names == (None, None) def test_groupby_sort_multiindex_series(): @@ -2099,6 +2120,7 @@ def get_categorical_invalid_expected(): is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype) is_dt64 = df.dtypes.iloc[0].kind == "M" is_cat = isinstance(values, Categorical) + is_str = isinstance(df.dtypes.iloc[0], pd.StringDtype) if ( isinstance(values, Categorical) @@ -2123,13 +2145,15 @@ def get_categorical_invalid_expected(): if op in ["prod", "sum", "skew"]: # ops that require more than just ordered-ness - if is_dt64 or is_cat or is_per: + if is_dt64 or is_cat or is_per or (is_str and op != "sum"): # GH#41291 # datetime64 -> prod and sum are invalid if is_dt64: msg = "datetime64 type does not support" elif is_per: msg = "Period type does not support" + elif is_str: + msg = f"dtype 'str' does not support operation '{op}'" else: msg = "category type does not support" if op == "skew": @@ -3083,7 +3107,7 @@ def test_obj_with_exclusions_duplicate_columns(): def test_groupby_numeric_only_std_no_result(numeric_only): # GH 51080 dicts_non_numeric = [{"a": "foo", "b": "bar"}, {"a": "car", "b": "dar"}] - df = DataFrame(dicts_non_numeric) + df = DataFrame(dicts_non_numeric, dtype=object) dfgb = df.groupby("a", as_index=False, sort=False) if numeric_only: @@ -3142,10 +3166,14 @@ def test_grouping_with_categorical_interval_columns(): def test_groupby_sum_on_nan_should_return_nan(bug_var): # GH 24196 df = DataFrame({"A": [bug_var, bug_var, bug_var, np.nan]}) + if isinstance(bug_var, str): + df = df.astype(object) dfgb = df.groupby(lambda x: x) result = dfgb.sum(min_count=1) - expected_df = DataFrame([bug_var, bug_var, bug_var, None], columns=["A"]) + expected_df = DataFrame( + [bug_var, bug_var, bug_var, None], columns=["A"], dtype=df["A"].dtype + ) tm.assert_frame_equal(result, expected_df) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 1a2acb658ee26..b5523592c3c5c 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -109,7 +109,7 @@ def test_groupby_resample_preserves_subclass(obj): df = obj( { - "Buyer": "Carl Carl Carl Carl Joe Carl".split(), + "Buyer": Series("Carl Carl Carl Carl Joe Carl".split(), dtype=object), "Quantity": [18, 3, 5, 1, 9, 3], "Date": [ datetime(2013, 9, 1, 13, 0), diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 3b7614347d181..3c1ed20ddcb16 100644 --- a/pandas/tests/groupby/test_numeric_only.py 
+++ b/pandas/tests/groupby/test_numeric_only.py @@ -29,7 +29,8 @@ def df(self): "group": [1, 1, 2], "int": [1, 2, 3], "float": [4.0, 5.0, 6.0], - "string": list("abc"), + "string": Series(["a", "b", "c"], dtype="str"), + "object": Series(["a", "b", "c"], dtype=object), "category_string": Series(list("abc")).astype("category"), "category_int": [7, 8, 9], "datetime": date_range("20130101", periods=3), @@ -41,6 +42,7 @@ def df(self): "int", "float", "string", + "object", "category_string", "category_int", "datetime", @@ -113,6 +115,7 @@ def test_first_last(self, df, method): "int", "float", "string", + "object", "category_string", "category_int", "datetime", @@ -160,7 +163,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): # object dtypes for transformations are not implemented in Cython and # have no Python fallback - exception = NotImplementedError if method.startswith("cum") else TypeError + exception = ( + (NotImplementedError, TypeError) if method.startswith("cum") else TypeError + ) if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"): # The methods default to numeric_only=False and raise TypeError @@ -171,6 +176,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): re.escape(f"agg function failed [how->{method},dtype->object]"), # cumsum/cummin/cummax/cumprod "function is not implemented for this dtype", + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): @@ -181,7 +187,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "category type does not support sum operations", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), - re.escape(f"agg function failed [how->{method},dtype->str]"), + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): @@ -199,7 +205,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): f"Cannot perform {method} with non-ordered Categorical", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), - re.escape(f"agg function failed [how->{method},dtype->str]"), + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): @@ -384,7 +390,9 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): re.escape(f"agg function failed [how->{kernel},dtype->object]"), ] ) - if kernel == "idxmin": + if kernel == "quantile": + msg = "dtype 'object' does not support operation 'quantile'" + elif kernel == "idxmin": msg = "'<' not supported between instances of 'type' and 'type'" elif kernel == "idxmax": msg = "'>' not supported between instances of 'type' and 'type'" @@ -458,7 +466,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): # that succeed should not be allowed to fail (without deprecation, at least) if groupby_func in fails_on_numeric_object and dtype is object: if groupby_func == "quantile": - msg = "cannot be performed against 'object' dtypes" + msg = "dtype 'object' does not support operation 'quantile'" else: msg = "is not supported for object dtype" warn = FutureWarning if groupby_func == "fillna" else None diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 4ebb26b0289ec..5457f5ba050c6 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -8,8 +8,6 @@ import 
numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( Categorical, DataFrame, @@ -119,10 +117,9 @@ def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): gb.transform(groupby_func, *args) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_string( - how, by, groupby_series, groupby_func, df_with_string_col + how, by, groupby_series, groupby_func, df_with_string_col, using_infer_string ): df = df_with_string_col args = get_groupby_method_args(groupby_func, df) @@ -182,7 +179,7 @@ def test_groupby_raises_string( TypeError, re.escape("agg function failed [how->prod,dtype->object]"), ), - "quantile": (TypeError, "cannot be performed against 'object' dtypes!"), + "quantile": (TypeError, "dtype 'object' does not support operation 'quantile'"), "rank": (None, ""), "sem": (ValueError, "could not convert string to float"), "shift": (None, ""), @@ -196,6 +193,37 @@ def test_groupby_raises_string( ), }[groupby_func] + if using_infer_string: + if groupby_func in [ + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + "quantile", + ]: + msg = f"dtype 'str' does not support operation '{groupby_func}'" + if groupby_func in ["sem", "std", "skew"]: + # The object-dtype raises ValueError when trying to convert to numeric. + klass = TypeError + elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow": + # This doesn't go through EA._groupby_op so the message isn't controlled + # there. + msg = "operation 'truediv' not supported for dtype 'str' with dtype 'str'" + elif groupby_func == "diff" and df["d"].dtype.storage == "pyarrow": + # This doesn't go through EA._groupby_op so the message isn't controlled + # there. 
+ msg = "operation 'sub' not supported for dtype 'str' with dtype 'str'" + + elif groupby_func in ["cummin", "cummax"]: + msg = msg.replace("object", "str") + elif groupby_func == "corrwith": + msg = "Cannot perform reduction 'mean' with string dtype" + if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" @@ -222,7 +250,12 @@ def func(x): @pytest.mark.parametrize("how", ["agg", "transform"]) @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) def test_groupby_raises_string_np( - how, by, groupby_series, groupby_func_np, df_with_string_col + how, + by, + groupby_series, + groupby_func_np, + df_with_string_col, + using_infer_string, ): # GH#50749 df = df_with_string_col @@ -239,6 +272,11 @@ def test_groupby_raises_string_np( ), }[groupby_func_np] + if using_infer_string: + if groupby_func_np is np.mean: + klass = TypeError + msg = "dtype 'str' does not support operation 'mean'" + if groupby_series: warn_msg = "using SeriesGroupBy.[sum|mean]" else: diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 2aada753e27f4..a516af7e15943 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -905,6 +905,7 @@ def test_cython_transform_frame_column( ".* is not supported for object dtype", "is not implemented for this dtype", ".* is not supported for str dtype", + "dtype 'str' does not support operation '.*'", ] ) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index af4cf5d4ebae5..74d06117cbb4a 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -188,7 +188,7 @@ def test_api_compat_before_use(attr): getattr(rs, attr) -def tests_raises_on_nuisance(test_frame): +def tests_raises_on_nuisance(test_frame, using_infer_string): df = test_frame df["D"] = "foo" r = df.resample("h") @@ -198,6 +198,8 @@ def tests_raises_on_nuisance(test_frame): expected = r[["A", "B", "C"]].mean() msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): r.mean() result = r.mean(numeric_only=True) @@ -932,7 +934,9 @@ def test_end_and_end_day_origin( ("sem", lib.no_default, "could not convert string to float"), ], ) -def test_frame_downsample_method(method, numeric_only, expected_data): +def test_frame_downsample_method( + method, numeric_only, expected_data, using_infer_string +): # GH#46442 test if `numeric_only` behave as expected for DataFrameGroupBy index = date_range("2018-01-01", periods=2, freq="D") @@ -949,6 +953,11 @@ def test_frame_downsample_method(method, numeric_only, expected_data): if method in ("var", "mean", "median", "prod"): klass = TypeError msg = re.escape(f"agg function failed [how->{method},dtype->") + if using_infer_string: + msg = f"dtype 'str' does not support operation '{method}'" + elif method in ["sum", "std", "sem"] and using_infer_string: + klass = TypeError + msg = f"dtype 'str' does not support operation '{method}'" else: klass = ValueError msg = expected_data @@ -983,7 +992,9 @@ def test_frame_downsample_method(method, numeric_only, expected_data): ("last", lib.no_default, ["cat_2"]), ], ) -def test_series_downsample_method(method, numeric_only, expected_data): +def test_series_downsample_method( + method, 
numeric_only, expected_data, using_infer_string +): # GH#46442 test if `numeric_only` behave as expected for SeriesGroupBy index = date_range("2018-01-01", periods=2, freq="D") @@ -999,8 +1010,11 @@ def test_series_downsample_method(method, numeric_only, expected_data): func(**kwargs) elif method == "prod": msg = re.escape("agg function failed [how->prod,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'prod'" with pytest.raises(TypeError, match=msg): func(**kwargs) + else: result = func(**kwargs) expected = Series(expected_data, index=expected_index) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 9188521c71158..4b79860437f72 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -625,7 +625,7 @@ def test_join_non_unique_period_index(self): ) tm.assert_frame_equal(result, expected) - def test_mixed_type_join_with_suffix(self): + def test_mixed_type_join_with_suffix(self, using_infer_string): # GH #916 df = DataFrame( np.random.default_rng(2).standard_normal((20, 6)), @@ -636,6 +636,8 @@ def test_mixed_type_join_with_suffix(self): grouped = df.groupby("id") msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.mean() mn = grouped.mean(numeric_only=True) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index d0858a0ea5558..75268ccee1d8c 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -948,12 +948,14 @@ def test_margins(self, data): for value_col in table.columns.levels[0]: self._check_output(table[value_col], value_col, data) - def test_no_col(self, data): + def test_no_col(self, data, using_infer_string): # no col # to help with a buglet data.columns = [k * 2 for k in data.columns] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") table = data.drop(columns="CC").pivot_table( @@ -1003,7 +1005,7 @@ def test_no_col(self, data): ], ) def test_margin_with_only_columns_defined( - self, columns, aggfunc, values, expected_columns + self, columns, aggfunc, values, expected_columns, using_infer_string ): # GH 31016 df = DataFrame( @@ -1027,6 +1029,8 @@ def test_margin_with_only_columns_defined( ) if aggfunc != "sum": msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) if "B" not in columns: From 2054463668ad130c3dfed1e789ed49dd49774409 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 13 Nov 2024 18:40:09 +0100 Subject: [PATCH 321/396] [backport 2.3.x] ENH (string dtype): convert string_view columns to future string dtype instead of object dtype in Parquet/Feather IO (#60235) (#60291) (cherry picked from commit f307a0a3615d93c2177f6581133bdb541e12a93c) --- pandas/compat/__init__.py | 2 ++ pandas/compat/pyarrow.py | 2 ++ pandas/io/_util.py | 9 +++++++-- pandas/tests/io/test_feather.py | 20 ++++++++++++++++++++ 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 38fb0188df5ff..5e82853109015 
100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -33,6 +33,7 @@ pa_version_under14p1, pa_version_under16p0, pa_version_under17p0, + pa_version_under18p0, ) if TYPE_CHECKING: @@ -191,6 +192,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under14p1", "pa_version_under16p0", "pa_version_under17p0", + "pa_version_under18p0", "HAS_PYARROW", "IS64", "ISMUSL", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 7fa197c4a9824..f579b8a45d386 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -17,6 +17,7 @@ pa_version_under15p0 = _palv < Version("15.0.0") pa_version_under16p0 = _palv < Version("16.0.0") pa_version_under17p0 = _palv < Version("17.0.0") + pa_version_under18p0 = _palv < Version("18.0.0") HAS_PYARROW = True except ImportError: pa_version_under10p1 = True @@ -28,4 +29,5 @@ pa_version_under15p0 = True pa_version_under16p0 = True pa_version_under17p0 = True + pa_version_under18p0 = False HAS_PYARROW = False diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 50a97f1059b5c..f3e6dba1391be 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -4,6 +4,7 @@ import numpy as np +from pandas.compat import pa_version_under18p0 from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -32,7 +33,11 @@ def _arrow_dtype_mapping() -> dict: def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") - return { + mapping = { pa.string(): pd.StringDtype(na_value=np.nan), pa.large_string(): pd.StringDtype(na_value=np.nan), - }.get + } + if not pa_version_under18p0: + mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan) + + return mapping.get diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 3b4484e44e155..58a5f78ce3258 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -2,6 +2,8 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under18p0 + import pandas as pd import pandas._testing as tm @@ -250,3 +252,21 @@ def test_string_inference(self, tmp_path): data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan) ) tm.assert_frame_equal(result, expected) + + @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0") + def test_string_inference_string_view_type(self, tmp_path): + # GH#54798 + import pyarrow as pa + from pyarrow import feather + + path = tmp_path / "string_view.parquet" + table = pa.table({"a": pa.array([None, "b", "c"], pa.string_view())}) + feather.write_feather(table, path) + + with pd.option_context("future.infer_string", True): + result = read_feather(path) + + expected = pd.DataFrame( + data={"a": [None, "b", "c"]}, dtype=pd.StringDtype(na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) From 54b47df88199741dac449fd0d2060e7dfdf8dd7e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Nov 2024 18:22:39 +0100 Subject: [PATCH 322/396] [backport 2.3.x] BUG (string dtype): replace with non-string to fall back to object dtype (#60285) (#60292) * BUG (string dtype): replace with non-string to fall back to object dtype (#60285) Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> (cherry picked from commit 938832ba325c6efc1710e002c0d3d4d9b3a6c8ba) * updates for 2.3 * fix inplace modification for 2.3.x branch with python storage --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/arrays/string_.py | 43 ++++++++++++-------- pandas/core/dtypes/cast.py | 7 
++++ pandas/core/internals/blocks.py | 40 ++++++++++++++---- pandas/tests/frame/methods/test_replace.py | 9 ---- pandas/tests/series/indexing/test_setitem.py | 18 +++----- pandas/tests/series/methods/test_replace.py | 16 +++----- 7 files changed, 76 insertions(+), 59 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 0751554d87dc8..3e699e1a27b55 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -107,10 +107,10 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`) +- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) - Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) -- Interval ^^^^^^^^ diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7e8726f96f90a..e163a9df8ee10 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -726,20 +726,9 @@ def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: return arr, self.dtype.na_value - def __setitem__(self, key, value) -> None: - value = extract_array(value, extract_numpy=True) - if isinstance(value, type(self)): - # extract_array doesn't extract NumpyExtensionArray subclasses - value = value._ndarray - - key = check_array_indexer(self, key) - scalar_key = lib.is_scalar(key) - scalar_value = lib.is_scalar(value) - if scalar_key and not scalar_value: - raise ValueError("setting an array element with a sequence.") - - # validate new items - if scalar_value: + def _maybe_convert_setitem_value(self, value): + """Maybe convert value to be pyarrow compatible.""" + if lib.is_scalar(value): if isna(value): value = self.dtype.na_value elif not isinstance(value, str): @@ -749,8 +738,11 @@ def __setitem__(self, key, value) -> None: "instead." ) else: + value = extract_array(value, extract_numpy=True) if not is_array_like(value): value = np.asarray(value, dtype=object) + elif isinstance(value.dtype, type(self.dtype)): + return value else: # cast categories and friends to arrays to see if values are # compatible, compatibility with arrow backed strings @@ -760,11 +752,26 @@ def __setitem__(self, key, value) -> None: "Invalid value for dtype 'str'. Value should be a " "string or missing value (or array of those)." 
) + return value - mask = isna(value) - if mask.any(): - value = value.copy() - value[isna(value)] = self.dtype.na_value + def __setitem__(self, key, value) -> None: + value = self._maybe_convert_setitem_value(value) + + key = check_array_indexer(self, key) + scalar_key = lib.is_scalar(key) + scalar_value = lib.is_scalar(value) + if scalar_key and not scalar_value: + raise ValueError("setting an array element with a sequence.") + + if not scalar_value: + if value.dtype == self.dtype: + value = value._ndarray + else: + value = np.asarray(value) + mask = isna(value) + if mask.any(): + value = value.copy() + value[isna(value)] = self.dtype.na_value super().__setitem__(key, value) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7a92b7306beea..1bc944935756e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1754,6 +1754,13 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: except (ValueError, TypeError): return False + if dtype == "string": + try: + arr._maybe_convert_setitem_value(element) # type: ignore[union-attr] + return True + except (ValueError, TypeError): + return False + # This is technically incorrect, but maintains the behavior of # ExtensionBlock._can_hold_element return True diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 917a65348b7a3..7ee1361912c05 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -84,6 +84,7 @@ ABCNumpyExtensionArray, ABCSeries, ) +from pandas.core.dtypes.inference import is_re from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, isna, @@ -115,6 +116,7 @@ PeriodArray, TimedeltaArray, ) +from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.computation import expressions @@ -476,7 +478,9 @@ def split_and_operate(self, func, *args, **kwargs) -> list[Block]: # Up/Down-casting @final - def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: + def coerce_to_target_dtype( + self, other, warn_on_upcast: bool = False, using_cow: bool = False + ) -> Block: """ coerce the current block to a dtype compat for other we will return a block, possibly object, and not raise @@ -528,7 +532,14 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: f"{self.values.dtype}. Please report a bug at " "https://github.com/pandas-dev/pandas/issues." ) - return self.astype(new_dtype, copy=False) + copy = False + if ( + not using_cow + and isinstance(self.dtype, StringDtype) + and self.dtype.storage == "python" + ): + copy = True + return self.astype(new_dtype, copy=copy, using_cow=using_cow) @final def _maybe_downcast( @@ -879,7 +890,7 @@ def replace( else: return [self] if inplace else [self.copy()] - elif self._can_hold_element(value): + elif self._can_hold_element(value) or (self.dtype == "string" and is_re(value)): # TODO(CoW): Maybe split here as well into columns where mask has True # and rest? 
blk = self._maybe_copy(using_cow, inplace) @@ -926,12 +937,13 @@ def replace( if value is None or value is NA: blk = self.astype(np.dtype(object)) else: - blk = self.coerce_to_target_dtype(value) + blk = self.coerce_to_target_dtype(value, using_cow=using_cow) return blk.replace( to_replace=to_replace, value=value, inplace=True, mask=mask, + using_cow=using_cow, ) else: @@ -980,16 +992,26 @@ def _replace_regex( ------- List[Block] """ - if not self._can_hold_element(to_replace): + if not is_re(to_replace) and not self._can_hold_element(to_replace): # i.e. only if self.is_object is True, but could in principle include a # String ExtensionBlock if using_cow: return [self.copy(deep=False)] return [self] if inplace else [self.copy()] - rx = re.compile(to_replace) + if is_re(to_replace) and self.dtype not in [object, "string"]: + # only object or string dtype can hold strings, and a regex object + # will only match strings + return [self.copy(deep=False)] - block = self._maybe_copy(using_cow, inplace) + if not ( + self._can_hold_element(value) or (self.dtype == "string" and is_re(value)) + ): + block = self.astype(np.dtype(object)) + else: + block = self._maybe_copy(using_cow, inplace) + + rx = re.compile(to_replace) replace_regex(block.values, rx, value, mask) @@ -1048,7 +1070,9 @@ def replace_list( # Exclude anything that we know we won't contain pairs = [ - (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) + (x, y) + for x, y in zip(src_list, dest_list) + if (self._can_hold_element(x) or (self.dtype == "string" and is_re(x))) ] if not len(pairs): if using_cow: diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index ccee7ca24bd3d..8df9893e73766 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -297,7 +297,6 @@ def test_regex_replace_dict_nested_non_first_character( expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_dict_nested_gh4115(self): df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) @@ -556,7 +555,6 @@ def test_replace_series_dict(self): result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_convert(self): # gh 3907 df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) @@ -932,7 +930,6 @@ def test_replace_input_formats_listlike(self): with pytest.raises(ValueError, match=msg): df.replace(to_rep, values[1:]) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_input_formats_scalar(self): df = DataFrame( {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} @@ -961,7 +958,6 @@ def test_replace_limit(self): # TODO pass - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_dict_no_regex(self): answer = Series( { @@ -985,7 +981,6 @@ def test_replace_dict_no_regex(self): result = answer.replace(weights) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_series_no_regex(self): answer = Series( { @@ -1104,7 +1099,6 @@ def test_replace_swapping_bug(self, using_infer_string): expect = DataFrame({"a": ["Y", "N", "Y"]}) 
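        # Related sketch (an assumption mirroring test_regex_replace_dict_nested_gh4115,
        # whose xfail is removed above): nested dict replacement of strings by
        # integers now also succeeds on string columns, e.g.
        #   >>> df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2})
        #   >>> df.replace({"Type": {"Q": 0, "T": 1}})  # "Type" becomes [0, 1, 0, 0, 1]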
tm.assert_frame_equal(res, expect) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_period(self): d = { "fname": { @@ -1141,7 +1135,6 @@ def test_replace_period(self): result = df.replace(d) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_datetime(self): d = { "fname": { @@ -1367,7 +1360,6 @@ def test_replace_commutative(self, df, to_replace, exp): result = df.replace(to_replace) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize( "replacer", [ @@ -1644,7 +1636,6 @@ def test_regex_replace_scalar( expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_frame(self, regex): # GH-48644 diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index d95ee99489076..a1263e2d30853 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -886,24 +886,16 @@ def test_index_where(self, obj, key, expected, warn, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if obj.dtype == "string" and not (isinstance(val, str) or isna(val)): - with pytest.raises(TypeError, match="Invalid value"): - Index(obj, dtype=obj.dtype).where(~mask, val) - else: - res = Index(obj, dtype=obj.dtype).where(~mask, val) - expected_idx = Index(expected, dtype=expected.dtype) - tm.assert_index_equal(res, expected_idx) + res = Index(obj, dtype=obj.dtype).where(~mask, val) + expected_idx = Index(expected, dtype=expected.dtype) + tm.assert_index_equal(res, expected_idx) def test_index_putmask(self, obj, key, expected, warn, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if obj.dtype == "string" and not (isinstance(val, str) or isna(val)): - with pytest.raises(TypeError, match="Invalid value"): - Index(obj, dtype=obj.dtype).putmask(mask, val) - else: - res = Index(obj, dtype=obj.dtype).putmask(mask, val) - tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) + res = Index(obj, dtype=obj.dtype).putmask(mask, val) + tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) @pytest.mark.parametrize( diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 79a66526a0004..0b0cf57a70c3f 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -403,10 +403,6 @@ def test_replace_categorical(self, categorical, numeric, using_infer_string): ser = pd.Series(categorical) msg = "Downcasting behavior in `replace`" msg = "with CategoricalDtype is deprecated" - if using_infer_string: - with pytest.raises(TypeError, match="Invalid value"): - ser.replace({"A": 1, "B": 2}) - return with tm.assert_produces_warning(FutureWarning, match=msg): result = ser.replace({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") @@ -745,13 +741,13 @@ def test_replace_regex_dtype_series(self, regex): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("regex", [False, True]) - def test_replace_regex_dtype_series_string(self, regex, using_infer_string): - if not using_infer_string: - # then this is object dtype which is already tested above - return + def 
test_replace_regex_dtype_series_string(self, regex): series = pd.Series(["0"], dtype="str") - with pytest.raises(TypeError, match="Invalid value"): - series.replace(to_replace="0", value=1, regex=regex) + expected = pd.Series([1], dtype="int64") + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = series.replace(to_replace="0", value=1, regex=regex) + tm.assert_series_equal(result, expected) def test_replace_different_int_types(self, any_int_numpy_dtype): # GH#45311 From c875a53fe39b0215c8b9593bd5bd7a8d60c9683c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Nov 2024 21:19:26 +0100 Subject: [PATCH 323/396] [backport 2.3.x] BUG (string dtype): let fillna with invalid value upcast to object dtype (#60296) (#60316) BUG (string dtype): let fillna with invalid value upcast to object dtype (#60296) * BUG (string dtype): let fillna with invalid value upcast to object dtype * fix fillna limit case + update tests for no longer raising (cherry picked from commit 34c39e9078ea8af12871a92bdcea2058553c9869) --- pandas/core/internals/blocks.py | 6 +++--- pandas/tests/frame/indexing/test_where.py | 8 +------- pandas/tests/series/indexing/test_setitem.py | 6 ------ 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7ee1361912c05..6ae591a5d4ac8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1710,7 +1710,7 @@ def fillna( return nbs if limit is not None: - mask[mask.cumsum(self.ndim - 1) > limit] = False + mask[mask.cumsum(self.values.ndim - 1) > limit] = False if inplace: nbs = self.putmask( @@ -2136,7 +2136,7 @@ def where( res_values = arr._where(cond, other).T except (ValueError, TypeError): if self.ndim == 1 or self.shape[0] == 1: - if isinstance(self.dtype, IntervalDtype): + if isinstance(self.dtype, (IntervalDtype, StringDtype)): # TestSetitemFloatIntervalWithIntIntervalValues blk = self.coerce_to_target_dtype(orig_other) nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) @@ -2338,7 +2338,7 @@ def fillna( using_cow: bool = False, already_warned=None, ) -> list[Block]: - if isinstance(self.dtype, IntervalDtype): + if isinstance(self.dtype, (IntervalDtype, StringDtype)): # Block.fillna handles coercion (test_fillna_interval) return super().fillna( value=value, diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 5fd3796d0255a..356257bbfec98 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -1086,15 +1086,9 @@ def test_where_producing_ea_cond_for_np_dtype(): @pytest.mark.parametrize( "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] ) -def test_where_int_overflow(replacement, using_infer_string): +def test_where_int_overflow(replacement): # GH 31687 df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]]) - if using_infer_string and replacement not in (None, "snake"): - with pytest.raises( - TypeError, match=f"Invalid value '{replacement}' for dtype 'str'" - ): - df.where(pd.notnull(df), replacement) - return result = df.where(pd.notnull(df), replacement) expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]]) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index a1263e2d30853..85558e85494eb 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ 
b/pandas/tests/series/indexing/test_setitem.py @@ -34,7 +34,6 @@ concat, date_range, interval_range, - isna, period_range, timedelta_range, ) @@ -865,11 +864,6 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace): obj = obj.copy() arr = obj._values - if obj.dtype == "string" and not (isinstance(val, str) or isna(val)): - with pytest.raises(TypeError, match="Invalid value"): - obj.where(~mask, val) - return - res = obj.where(~mask, val) if val is NA and res.dtype == object: From 9537b2081187313d6a37169418df2bed468f2b9b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Nov 2024 21:19:51 +0100 Subject: [PATCH 324/396] [backport 2.3.x] TST (string dtype): resolve all easy xfails in pandas/tests/groupby (#60314) (#60317) TST (string dtype): resolve all easy xfails in pandas/tests/groupby (#60314) (cherry picked from commit c4a20261c337d68dc470fb6fd6a5505e2c7348d0) --- pandas/tests/groupby/aggregate/test_aggregate.py | 8 ++------ pandas/tests/groupby/aggregate/test_cython.py | 7 +++---- pandas/tests/groupby/aggregate/test_other.py | 6 ++---- pandas/tests/groupby/methods/test_quantile.py | 5 +---- pandas/tests/groupby/methods/test_size.py | 2 ++ pandas/tests/groupby/test_categorical.py | 9 +++++---- pandas/tests/groupby/test_groupby.py | 9 +++------ pandas/tests/groupby/test_groupby_dropna.py | 5 +---- pandas/tests/groupby/test_grouping.py | 10 ++++------ pandas/tests/groupby/test_pipe.py | 6 +----- pandas/tests/groupby/test_reductions.py | 7 ++----- pandas/tests/groupby/test_timegrouper.py | 2 ++ pandas/tests/groupby/transform/test_transform.py | 7 ++----- 13 files changed, 30 insertions(+), 53 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index b267347aaf030..f02a828fe8d17 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SpecificationError from pandas.core.dtypes.common import is_integer_dtype @@ -335,12 +333,11 @@ def aggfun_1(ser): assert len(result) == 0 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wrap_agg_out(three_group): grouped = three_group.groupby(["A", "B"]) def func(ser): - if ser.dtype == object: + if ser.dtype in (object, "string"): raise TypeError("Test error message") return ser.sum() @@ -1101,7 +1098,6 @@ def test_lambda_named_agg(func): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_aggregate_mixed_types(): # GH 16916 df = DataFrame( @@ -1113,7 +1109,7 @@ def test_aggregate_mixed_types(): expected = DataFrame( expected_data, index=Index([2, "group 1"], dtype="object", name="grouping"), - columns=Index(["X", "Y", "Z"], dtype="object"), + columns=Index(["X", "Y", "Z"]), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 2990fb5949242..0d04af3801dbe 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -95,7 +93,6 @@ def test_cython_agg_boolean(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), 
reason="TODO(infer_string)") def test_cython_agg_nothing_to_agg(): frame = DataFrame( {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25} @@ -111,7 +108,9 @@ def test_cython_agg_nothing_to_agg(): result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True) expected = DataFrame( - [], index=frame["a"].sort_values().drop_duplicates(), columns=[] + [], + index=frame["a"].sort_values().drop_duplicates(), + columns=Index([], dtype="str"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 5904b2f48359e..213704f31aca5 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SpecificationError import pandas as pd @@ -308,7 +306,6 @@ def test_series_agg_multikey(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_series_agg_multi_pure_python(): data = DataFrame( { @@ -358,7 +355,8 @@ def test_series_agg_multi_pure_python(): ) def bad(x): - assert len(x.values.base) > 0 + if isinstance(x.values, np.ndarray): + assert len(x.values.base) > 0 return "foo" result = data.groupby(["A", "B"]).agg(bad) diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index 4269b41a0871b..3943590b069ad 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -170,11 +168,10 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) - msg = "dtype 'object' does not support operation 'quantile'" + msg = "dtype '(object|str)' does not support operation 'quantile'" with pytest.raises(TypeError, match=msg): df.groupby("key").quantile() diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index fb834ee2a8799..271802c447024 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -108,6 +108,8 @@ def test_size_series_masked_type_returns_Int64(dtype): tm.assert_series_equal(result, expected) +# TODO(infer_string) in case the column is object dtype, it should preserve that dtype +# for the result's index @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_size_strings(any_string_dtype): # GH#55627 diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index cded7a71458fa..447df952fd0e5 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -340,8 +338,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_observed(observed): +def test_observed(request, using_infer_string, 
observed): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) @@ -349,6 +346,10 @@ def test_observed(observed): # gh-8138 (back-compat) # gh-8869 + if using_infer_string and not observed: + # TODO(infer_string) this fails with filling the string column with 0 + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3e2d15ede3648..9b362164c6149 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1617,7 +1617,6 @@ def test_groupby_two_group_keys_all_nan(): assert result == {} -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_2d_malformed(): d = DataFrame(index=range(2)) d["group"] = ["g1", "g2"] @@ -1626,7 +1625,7 @@ def test_groupby_2d_malformed(): d["label"] = ["l1", "l2"] tmp = d.groupby(["group"]).mean(numeric_only=True) res_values = np.array([[0.0, 1.0], [0.0, 1.0]]) - tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"])) + tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object)) tm.assert_numpy_array_equal(tmp.values, res_values) @@ -2711,7 +2710,6 @@ def test_groupby_all_nan_groups_drop(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_empty_multi_column(as_index, numeric_only): # GH 15106 & GH 41998 @@ -2720,7 +2718,7 @@ def test_groupby_empty_multi_column(as_index, numeric_only): result = gb.sum(numeric_only=numeric_only) if as_index: index = MultiIndex([[], []], [[], []], names=["A", "B"]) - columns = ["C"] if not numeric_only else [] + columns = ["C"] if not numeric_only else Index([], dtype="str") else: index = RangeIndex(0) columns = ["A", "B", "C"] if not numeric_only else ["A", "B"] @@ -2728,7 +2726,6 @@ def test_groupby_empty_multi_column(as_index, numeric_only): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_aggregation_non_numeric_dtype(): # GH #43108 df = DataFrame( @@ -2739,7 +2736,7 @@ def test_groupby_aggregation_non_numeric_dtype(): { "v": [[1, 1], [10, 20]], }, - index=Index(["M", "W"], dtype="object", name="MW"), + index=Index(["M", "W"], name="MW"), ) gb = df.groupby(by=["MW"]) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 7e65e56abc4c9..2a9b61aa7ebf5 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.missing import na_value_for_dtype @@ -99,7 +97,6 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( tm.assert_frame_equal(grouped, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dropna, idx, outputs", [ @@ -126,7 +123,7 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"]) grouped = df.groupby("a", 
dropna=dropna).sum() - expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a")) + expected = pd.DataFrame(outputs, index=pd.Index(idx, name="a")) tm.assert_frame_equal(grouped, expected) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 7c0a4b78a123d..9a0e67dea532b 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( CategoricalIndex, @@ -844,7 +842,6 @@ def test_groupby_empty(self): expected = ["name"] assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_level_index_value_all_na(self): # issue 20519 df = DataFrame( @@ -854,7 +851,7 @@ def test_groupby_level_index_value_all_na(self): expected = DataFrame( data=[], index=MultiIndex( - levels=[Index(["x"], dtype="object"), Index([], dtype="float64")], + levels=[Index(["x"], dtype="str"), Index([], dtype="float64")], codes=[[], []], names=["A", "B"], ), @@ -989,12 +986,13 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_with_single_column(self): df = DataFrame({"a": list("abssbab")}) tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]]) # GH 13530 - exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[]) + exp = DataFrame( + index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype="str") + ) tm.assert_frame_equal(df.groupby("a").count(), exp) tm.assert_frame_equal(df.groupby("a").sum(), exp) diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py index 1044c83e3e56b..ee59a93695bcf 100644 --- a/pandas/tests/groupby/test_pipe.py +++ b/pandas/tests/groupby/test_pipe.py @@ -1,7 +1,4 @@ import numpy as np -import pytest - -from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -11,7 +8,6 @@ import pandas._testing as tm -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_pipe(): # Test the pipe method of DataFrameGroupBy. 
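# Illustrative sketch (not part of the patch): these test updates drop
# hard-coded dtype="object" expectations because, with pandas' opt-in string
# inference, the same constructors infer a string dtype; letting the dtype be
# inferred keeps the assertions valid in both modes. Assumes pandas >= 2.1,
# where the "future.infer_string" option exists.
import pandas as pd

with pd.option_context("future.infer_string", True):
    print(pd.Index(["bar", "foo"], name="A").dtype)  # string dtype
with pd.option_context("future.infer_string", False):
    print(pd.Index(["bar", "foo"], name="A").dtype)  # object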
# Issue #17871 @@ -39,7 +35,7 @@ def square(srs): # NDFrame.pipe methods result = df.groupby("A").pipe(f).pipe(square) - index = Index(["bar", "foo"], dtype="object", name="A") + index = Index(["bar", "foo"], name="A") expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index) tm.assert_series_equal(expected, result) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 8e1bbcb43e3f3..599b0aabf85d5 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import iNaT from pandas.core.dtypes.common import pandas_dtype @@ -457,8 +455,7 @@ def test_max_min_non_numeric(): assert "ss" in result -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_max_min_object_multiple_columns(using_array_manager): +def test_max_min_object_multiple_columns(using_array_manager, using_infer_string): # GH#41111 case where the aggregation is valid for some columns but not # others; we split object blocks column-wise, consistent with # DataFrame._reduce @@ -472,7 +469,7 @@ def test_max_min_object_multiple_columns(using_array_manager): ) df._consolidate_inplace() # should already be consolidate, but double-check if not using_array_manager: - assert len(df._mgr.blocks) == 2 + assert len(df._mgr.blocks) == 3 if using_infer_string else 2 gb = df.groupby("A") diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 92dfe146bbb54..3bae719e01b73 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -75,6 +75,8 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): class TestGroupBy: + # TODO(infer_string) resample sum introduces 0's + # https://github.com/pandas-dev/pandas/issues/60229 @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_with_timegrouper(self): # GH 4161 diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index a516af7e15943..18ce6e93de402 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -2,8 +2,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.core.dtypes.common import ensure_platform_int @@ -1229,20 +1227,19 @@ def test_groupby_transform_with_datetimes(func, values): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_transform_dtype(): # GH 22243 df = DataFrame({"a": [1], "val": [1.35]}) result = df["val"].transform(lambda x: x.map(lambda y: f"+{y}")) - expected1 = Series(["+1.35"], name="val", dtype="object") + expected1 = Series(["+1.35"], name="val") tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}")) tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+({y})")) - expected2 = Series(["+(1.35)"], name="val", dtype="object") + expected2 = Series(["+(1.35)"], name="val") tm.assert_series_equal(result, expected2) df["val"] = df["val"].astype(object) From aa8adfa6b7451cbc016743596312099403cb4fd4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Nov 2024 
23:16:11 +0100 Subject: [PATCH 325/396] [backport 2.3.x] String dtype: enable in SQL IO + resolve all xfails (#60255) (#60315) (cherry picked from commit ba4d1cfdda14bf521ff91d6ad432b21095c417fd) Co-authored-by: Will Ayd --- pandas/_libs/lib.pyx | 8 +++++++- pandas/core/dtypes/cast.py | 2 ++ pandas/core/internals/construction.py | 5 +++-- pandas/io/sql.py | 21 +++++++++++++++++++-- pandas/tests/io/test_sql.py | 23 +++++++++++++---------- 5 files changed, 44 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c23f907aecfab..bc039917aef87 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2741,7 +2741,13 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if using_string_dtype() and is_string_array(objects, skipna=True): + if convert_to_nullable_dtype and is_string_array(objects, skipna=True): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype() + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + + elif using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(na_value=np.nan) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1bc944935756e..a1ef8a3e27e9c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1163,6 +1163,7 @@ def convert_dtypes( def maybe_infer_to_datetimelike( value: npt.NDArray[np.object_], + convert_to_nullable_dtype: bool = False, ) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray: """ we might have a array (or single object) that is datetime like, @@ -1200,6 +1201,7 @@ def maybe_infer_to_datetimelike( # numpy would have done it for us. convert_numeric=False, convert_non_numeric=True, + convert_to_nullable_dtype=convert_to_nullable_dtype, dtype_if_all_nat=np.dtype("M8[ns]"), ) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 137648ee52bf7..64fac5fcfcdc2 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -1042,8 +1042,9 @@ def convert(arr): if dtype is None: if arr.dtype == np.dtype("O"): # i.e. maybe_convert_objects didn't convert - arr = maybe_infer_to_datetimelike(arr) - if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): + convert_to_nullable_dtype = dtype_backend != "numpy" + arr = maybe_infer_to_datetimelike(arr, convert_to_nullable_dtype) + if convert_to_nullable_dtype and arr.dtype == np.dtype("O"): new_dtype = StringDtype() arr_cls = new_dtype.construct_array_type() arr = arr_cls._from_sequence(arr, dtype=new_dtype) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 03ef1792f1fb8..07c95806d7326 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -46,6 +46,8 @@ from pandas.core.dtypes.common import ( is_dict_like, is_list_like, + is_object_dtype, + is_string_dtype, ) from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -59,6 +61,7 @@ Series, ) from pandas.core.arrays import ArrowExtensionArray +from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.common import maybe_make_list @@ -1331,7 +1334,12 @@ def _harmonize_columns( elif dtype_backend == "numpy" and col_type is float: # floats support NA, can always convert! 
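# Illustrative sketch (not part of the patch) of the conversion the new
# String branch below performs: an object column fetched from SQL is cast to
# pandas' NaN-backed string dtype when the mapped column type is a string
# dtype. Assumes pandas >= 2.3, where StringDtype accepts na_value.
import numpy as np
import pandas as pd

raw = pd.Series(["a", "b", None], dtype=object)  # as fetched from SQL
converted = raw.astype(pd.StringDtype(na_value=np.nan))
print(converted.dtype)  # str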
self.frame[col_name] = df_col.astype(col_type, copy=False) - + elif ( + using_string_dtype() + and is_string_dtype(col_type) + and is_object_dtype(self.frame[col_name]) + ): + self.frame[col_name] = df_col.astype(col_type, copy=False) elif dtype_backend == "numpy" and len(df_col) == df_col.count(): # No NA values, can convert ints and bools if col_type is np.dtype("int64") or col_type is bool: @@ -1418,6 +1426,7 @@ def _get_dtype(self, sqltype): DateTime, Float, Integer, + String, ) if isinstance(sqltype, Float): @@ -1437,6 +1446,10 @@ def _get_dtype(self, sqltype): return date elif isinstance(sqltype, Boolean): return bool + elif isinstance(sqltype, String): + if using_string_dtype(): + return StringDtype(na_value=np.nan) + return object @@ -2218,7 +2231,7 @@ def read_table( elif using_string_dtype(): from pandas.io._util import arrow_string_types_mapper - arrow_string_types_mapper() + mapping = arrow_string_types_mapper() else: mapping = None @@ -2299,6 +2312,10 @@ def read_query( from pandas.io._util import _arrow_dtype_mapping mapping = _arrow_dtype_mapping().get + elif using_string_dtype(): + from pandas.io._util import arrow_string_types_mapper + + mapping = arrow_string_types_mapper() else: mapping = None diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 514eaceaccbe6..29efe7a457ff8 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -63,7 +63,7 @@ pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), + pytest.mark.single_cpu, ] @@ -685,6 +685,7 @@ def postgresql_psycopg2_conn(postgresql_psycopg2_engine): @pytest.fixture def postgresql_adbc_conn(): + pytest.importorskip("pyarrow") pytest.importorskip("adbc_driver_postgresql") from adbc_driver_postgresql import dbapi @@ -817,6 +818,7 @@ def sqlite_conn_types(sqlite_engine_types): @pytest.fixture def sqlite_adbc_conn(): + pytest.importorskip("pyarrow") pytest.importorskip("adbc_driver_sqlite") from adbc_driver_sqlite import dbapi @@ -986,13 +988,13 @@ def test_dataframe_to_sql(conn, test_frame1, request): @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_empty(conn, test_frame1, request): - if conn == "postgresql_adbc_conn": + if conn == "postgresql_adbc_conn" and not using_string_dtype(): request.node.add_marker( pytest.mark.xfail( - reason="postgres ADBC driver cannot insert index with null type", - strict=True, + reason="postgres ADBC driver < 1.2 cannot insert index with null type", ) ) + # GH 51086 if conn is sqlite_engine conn = request.getfixturevalue(conn) empty_df = test_frame1.iloc[:0] @@ -3571,7 +3573,8 @@ def test_read_sql_dtype_backend( result = getattr(pd, func)( f"Select * from {table}", conn, dtype_backend=dtype_backend ) - expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + tm.assert_frame_equal(result, expected) if "adbc" in conn_name: @@ -3621,7 +3624,7 @@ def test_read_sql_dtype_backend_table( with pd.option_context("mode.string_storage", string_storage): result = getattr(pd, func)(table, conn, dtype_backend=dtype_backend) - expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) tm.assert_frame_equal(result, expected) if "adbc" in conn_name: @@ -4150,7 +4153,7 @@ def tquery(query, con=None): def 
test_xsqlite_basic(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) assert sql.to_sql(frame, name="test_table", con=sqlite_buildin, index=False) == 10 @@ -4177,7 +4180,7 @@ def test_xsqlite_basic(sqlite_buildin): def test_xsqlite_write_row_by_row(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) frame.iloc[0, 0] = np.nan @@ -4200,7 +4203,7 @@ def test_xsqlite_write_row_by_row(sqlite_buildin): def test_xsqlite_execute(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) create_sql = sql.get_schema(frame, "test") @@ -4221,7 +4224,7 @@ def test_xsqlite_execute(sqlite_buildin): def test_xsqlite_schema(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) create_sql = sql.get_schema(frame, "test") From fe1f4f9c5b8ccbab10d12a9d20e7060a003e59f4 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 15 Nov 2024 13:44:44 -0500 Subject: [PATCH 326/396] Backport PR #60318 on branch 2.3.x (TST (string dtype): resolve all xfails in JSON IO tests) (#60327) Backport PR #60318: TST (string dtype): resolve all xfails in JSON IO tests (cherry picked from commit 9bc88c79e6fd146a44970309bacc90490fdec590) --- pandas/tests/io/json/test_json_table_schema.py | 8 +------- pandas/tests/io/json/test_pandas.py | 14 ++++---------- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 1e47b3bc38737..1c7320aa7a083 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, @@ -26,10 +24,6 @@ set_default_names, ) -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - @pytest.fixture def df_schema(): @@ -126,7 +120,7 @@ def test_multiindex(self, df_schema, using_infer_string): expected["fields"][0] = { "name": "level_0", "type": "any", - "extDtype": "string", + "extDtype": "str", } expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "str"} assert result == expected diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a8608434be5ee..10f1e7df648f0 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -118,7 +118,7 @@ def datetime_frame(self): # since that doesn't round-trip, see GH#33711 df = DataFrame( np.random.default_rng(2).standard_normal((30, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=30, freq="B"), ) df.index = df.index._with_freq(None) @@ -203,7 +203,6 @@ def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame): assert_json_roundtrip_equal(result, expected, orient) - 
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [False, np.int64]) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame): @@ -281,7 +280,6 @@ def test_roundtrip_empty(self, orient, convert_axes): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_timestamp(self, orient, convert_axes, datetime_frame): # TODO: improve coverage with date_format parameter @@ -709,7 +707,6 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("dtype", [False, None]) def test_series_roundtrip_object(self, orient, dtype, object_series): data = StringIO(object_series.to_json(orient=orient)) @@ -721,6 +718,9 @@ def test_series_roundtrip_object(self, orient, dtype, object_series): if orient != "split": expected.name = None + if using_string_dtype(): + expected = expected.astype("str") + tm.assert_series_equal(result, expected) def test_series_roundtrip_empty(self, orient): @@ -814,7 +814,6 @@ def test_path(self, float_frame, int_frame, datetime_frame): df.to_json(path) read_json(path) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_axis_dates(self, datetime_series, datetime_frame): # frame json = StringIO(datetime_frame.to_json()) @@ -827,7 +826,6 @@ def test_axis_dates(self, datetime_series, datetime_frame): tm.assert_series_equal(result, datetime_series, check_names=False) assert result.name is None - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dates(self, datetime_series, datetime_frame): # frame df = datetime_frame @@ -898,7 +896,6 @@ def test_convert_dates_infer(self, infer_word): result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]] tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "date,date_unit", [ @@ -959,7 +956,6 @@ def test_date_format_series_raises(self, datetime_series): with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_date_unit(self, unit, datetime_frame): df = datetime_frame @@ -1065,7 +1061,6 @@ def test_round_trip_exception(self, datapath): res = res.fillna(np.nan, downcast=False) tm.assert_frame_equal(res, df) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.network @pytest.mark.single_cpu @pytest.mark.parametrize( @@ -1474,7 +1469,6 @@ def test_data_frame_size_after_to_json(self): assert size_before == size_after - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]] ) From e37ffb3067c9032787dfb66836e11900e571ca32 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 15 Nov 2024 10:46:35 -0800 Subject: [PATCH 327/396] Backport PR #60320 on branch 2.3.x (TST (string dtype): resolve xfails in common IO tests) (#60325) Backport PR #60320: TST (string 
dtype): resolve xfails in common IO tests Co-authored-by: Joris Van den Bossche --- pandas/tests/io/test_clipboard.py | 13 ++++++------ pandas/tests/io/test_common.py | 33 +++++++++++++---------------- pandas/tests/io/test_compression.py | 15 ++++++------- pandas/tests/io/test_gcs.py | 5 ++--- 4 files changed, 29 insertions(+), 37 deletions(-) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 3a52ff5acc0b3..a16c63e8d3d65 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( PyperclipException, PyperclipWindowsException, @@ -26,10 +24,6 @@ init_qt_clipboard, ) -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - def build_kwargs(sep, excel): kwargs = {} @@ -351,7 +345,7 @@ def test_raw_roundtrip(self, data): @pytest.mark.parametrize("engine", ["c", "python"]) def test_read_clipboard_dtype_backend( - self, clipboard, string_storage, dtype_backend, engine + self, clipboard, string_storage, dtype_backend, engine, using_infer_string ): # GH#50502 if dtype_backend == "pyarrow": @@ -396,6 +390,11 @@ def test_read_clipboard_dtype_backend( ) expected["g"] = ArrowExtensionArray(pa.array([None, None])) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) def test_invalid_dtype_backend(self): diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index d38f716cf6a98..d1e42b297f143 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -154,7 +154,6 @@ def test_bytesiowrapper_returns_correct_bytes(self): assert result == data.encode("utf-8") # Test that pyarrow can handle a file opened with get_handle - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_get_handle_pyarrow_compat(self): pa_csv = pytest.importorskip("pyarrow.csv") @@ -169,6 +168,8 @@ def test_get_handle_pyarrow_compat(self): s = StringIO(data) with icom.get_handle(s, "rb", is_text=False) as handles: df = pa_csv.read_csv(handles.handle).to_pandas() + # TODO will have to update this when pyarrow' to_pandas() is fixed + expected = expected.astype("object") tm.assert_frame_equal(df, expected) assert not s.closed @@ -352,7 +353,6 @@ def test_read_fspath_all(self, reader, module, path, datapath): ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"), ], ) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_write_fspath_all(self, writer_name, writer_kwargs, module): if writer_name in ["to_latex"]: # uses Styler implementation pytest.importorskip("jinja2") @@ -379,7 +379,7 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): expected = f_path.read() assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) hdf support") def test_write_fspath_hdf5(self): # Same test as write_fspath_all, except HDF5 files aren't # necessarily byte-for-byte identical for a given dataframe, so we'll @@ -450,14 +450,13 @@ def test_unknown_engine(self): with tm.ensure_clean() as path: df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], 
dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.to_csv(path) with pytest.raises(ValueError, match="Unknown engine"): pd.read_csv(path, engine="pyt") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_binary_mode(self): """ 'encoding' shouldn't be passed to 'open' in binary mode. @@ -467,8 +466,8 @@ def test_binary_mode(self): with tm.ensure_clean() as path: df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.to_csv(path, mode="w+b") tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) @@ -485,8 +484,8 @@ def test_warning_missing_utf_bom(self, encoding, compression_): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with tm.assert_produces_warning(UnicodeWarning): @@ -516,15 +515,14 @@ def test_is_fsspec_url(): assert icom.is_fsspec_url("RFC-3986+compliant.spec://something") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("encoding", [None, "utf-8"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_codecs_encoding(encoding, format): # GH39247 expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with codecs.open(path, mode="w", encoding=encoding) as handle: @@ -537,13 +535,12 @@ def test_codecs_encoding(encoding, format): tm.assert_frame_equal(expected, df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_codecs_get_writer_reader(): # GH39247 expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with open(path, "wb") as handle: @@ -568,8 +565,8 @@ def test_explicit_encoding(io_class, mode, msg): # wrong mode is requested expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with io_class() as buffer: with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 25504c7b88fdb..af89f0916355e 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -12,8 +12,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import is_platform_windows import pandas as pd @@ -139,7 +137,6 @@ def test_compression_warning(compression_only): df.to_csv(handles.handle, compression=compression_only) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_compression_binary(compression_only): """ Binary file handles support compression. 
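# Illustrative sketch (not part of the patch): with the fixtures above now
# inferring column/index dtypes instead of pinning object dtype, a plain CSV
# round-trip still compares equal whether or not the string dtype is enabled.
import io

import numpy as np
import pandas as pd

df = pd.DataFrame(
    1.1 * np.arange(8).reshape((2, 4)),
    columns=pd.Index(list("ABCD")),
    index=pd.Index(["i-0", "i-1"]),
)
buf = io.BytesIO()
df.to_csv(buf)
roundtripped = pd.read_csv(io.BytesIO(buf.getvalue()), index_col=0)
pd.testing.assert_frame_equal(df, roundtripped)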
@@ -148,8 +145,8 @@ def test_compression_binary(compression_only): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) # with a file @@ -180,8 +177,8 @@ def test_gzip_reproducibility_file_name(): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) compression_options = {"method": "gzip", "mtime": 1} @@ -203,8 +200,8 @@ def test_gzip_reproducibility_file_object(): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) compression_options = {"method": "gzip", "mtime": 1} diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 81f951b3958b0..c7671bfb513aa 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -147,7 +147,6 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): assert result == expected -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("encoding", ["utf-8", "cp1251"]) def test_to_csv_compression_encoding_gcs( gcs_buffer, compression_only, encoding, compression_to_extension @@ -160,8 +159,8 @@ def test_to_csv_compression_encoding_gcs( """ df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) # reference of compressed and encoded file From 4f13697fb71085a1f6b4ffe12a270ff9d423609a Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 15 Nov 2024 17:10:56 -0500 Subject: [PATCH 328/396] Backport PR #60312 on branch 2.3.x (TST (string dtype): resolve xfails in pandas/tests/apply + raise TypeError for ArrowArray accumulate) (#60328) * Backport PR #60312 on branch 2.3.x (TST (string dtype): resolve xfails in pandas/tests/apply + raise TypeError for ArrowArray accumulate) (cherry picked from commit fba5f08f048215a6e0a578f8bad7b7f2c9ee8eef) * 2.3 test function compat --------- Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/arrow/array.py | 6 +++++- pandas/tests/apply/test_invalid_arg.py | 30 +++++++++----------------- pandas/tests/apply/test_str.py | 13 ++++++----- pandas/tests/extension/test_arrow.py | 2 +- 4 files changed, 24 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 13e10c8d3a738..0c1e1d0c63c85 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1633,7 +1633,11 @@ def _accumulate( else: data_to_accum = data_to_accum.cast(pa.int64()) - result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + try: + result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + except pa.ArrowNotImplementedError as err: + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) from err if convert_to_int: result = result.cast(pa_dtype) diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py 
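# Illustrative sketch (not part of the patch): the ArrowExtensionArray change
# above surfaces an unsupported accumulation as a pandas TypeError instead of
# a raw pyarrow ArrowNotImplementedError. Assumes pyarrow is installed; the
# exact message varies with the string dtype variant.
import pandas as pd

s = pd.Series(["a", "b", "c"], dtype="string[pyarrow]")
try:
    s.cumprod()
except TypeError as err:
    print(err)  # e.g. operation 'cumprod' not supported for dtype 'string'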
index 8963265b0a800..68f3fe36546a0 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -218,18 +218,12 @@ def transform(row): def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string): # GH 21224 if using_infer_string: - if df.dtypes.iloc[0].storage == "pyarrow": - import pyarrow as pa - - # TODO(infer_string) - # should raise a proper TypeError instead of propagating the pyarrow error - - expected = (expected, pa.lib.ArrowNotImplementedError) - else: - expected = (expected, NotImplementedError) + expected = (expected, NotImplementedError) msg = ( - "can't multiply sequence by non-int of type 'str'|has no kernel|cannot perform" + "can't multiply sequence by non-int of type 'str'" + "|cannot perform cumprod with type str" # NotImplementedError python backend + "|operation 'cumprod' not supported for dtype 'str'" # TypeError pyarrow ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): @@ -259,16 +253,12 @@ def test_agg_cython_table_raises_series(series, func, expected, using_infer_stri if func == "median" or func is np.nanmedian or func is np.median: msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" - if using_infer_string: - if series.dtype.storage == "pyarrow": - import pyarrow as pa - - # TODO(infer_string) - # should raise a proper TypeError instead of propagating the pyarrow error - expected = (expected, pa.lib.ArrowNotImplementedError) - else: - expected = (expected, NotImplementedError) - msg = msg + "|does not support|has no kernel|Cannot perform|cannot perform" + if using_infer_string and func in ("cumprod", np.cumprod, np.nancumprod): + expected = (expected, NotImplementedError) + + msg = ( + msg + "|does not support|has no kernel|Cannot perform|cannot perform|operation" + ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 8956aed5e9ceb..f916567c6b883 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import is_number from pandas import ( @@ -88,7 +86,6 @@ def test_apply_np_transformer(float_frame, op, how): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( @@ -147,7 +144,6 @@ def test_agg_cython_table_series(series, func, expected): assert result == expected -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( @@ -170,10 +166,17 @@ def test_agg_cython_table_series(series, func, expected): ), ), ) -def test_agg_cython_table_transform_series(series, func, expected): +def test_agg_cython_table_transform_series(request, series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if series.dtype == "string" and func in ("cumsum", np.cumsum, np.nancumsum): + request.applymarker( + pytest.mark.xfail( + raises=(TypeError, NotImplementedError), + reason="TODO(infer_string) cumsum not yet implemented for string", + ) + ) warn = None if isinstance(func, str) else FutureWarning with tm.assert_produces_warning(warn, match="is currently using Series.*"): result = series.agg(func) diff --git 
a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0ce7a66e0e00c..03ab7c7f1dad8 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -436,7 +436,7 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques request.applymarker( pytest.mark.xfail( reason=f"{all_numeric_accumulations} not implemented for {pa_type}", - raises=NotImplementedError, + raises=TypeError, ) ) From 38565aa88ea27617f71ea754b213088772e3fad7 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 16 Nov 2024 04:34:42 -0500 Subject: [PATCH 329/396] Backport PR #60324: REF: centralize pyarrow Table to pandas conversions and types_mapper handling (#60332) (cherry picked from commit 12d6f602eea98275553ac456f90201151b1f9bf8) Co-authored-by: Joris Van den Bossche --- pandas/io/_util.py | 49 ++++++++++++++++++++++- pandas/io/feather_format.py | 17 +------- pandas/io/json/_json.py | 15 +------ pandas/io/orc.py | 21 +--------- pandas/io/parquet.py | 34 ++++++++-------- pandas/io/parsers/arrow_parser_wrapper.py | 33 ++++++--------- pandas/io/sql.py | 41 ++++--------------- pandas/tests/io/test_sql.py | 4 +- 8 files changed, 92 insertions(+), 122 deletions(-) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index f3e6dba1391be..9373888e28d28 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -1,14 +1,27 @@ from __future__ import annotations -from typing import Callable +from typing import ( + TYPE_CHECKING, + Literal, +) import numpy as np +from pandas._config import using_string_dtype + +from pandas._libs import lib from pandas.compat import pa_version_under18p0 from pandas.compat._optional import import_optional_dependency import pandas as pd +if TYPE_CHECKING: + from collections.abc import Callable + + import pyarrow + + from pandas._typing import DtypeBackend + def _arrow_dtype_mapping() -> dict: pa = import_optional_dependency("pyarrow") @@ -30,7 +43,7 @@ def _arrow_dtype_mapping() -> dict: } -def arrow_string_types_mapper() -> Callable: +def _arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") mapping = { @@ -41,3 +54,35 @@ def arrow_string_types_mapper() -> Callable: mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan) return mapping.get + + +def arrow_table_to_pandas( + table: pyarrow.Table, + dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default, + null_to_int64: bool = False, + to_pandas_kwargs: dict | None = None, +) -> pd.DataFrame: + if to_pandas_kwargs is None: + to_pandas_kwargs = {} + + pa = import_optional_dependency("pyarrow") + + types_mapper: type[pd.ArrowDtype] | None | Callable + if dtype_backend == "numpy_nullable": + mapping = _arrow_dtype_mapping() + if null_to_int64: + # Modify the default mapping to also map null to Int64 + # (to match other engines - only for CSV parser) + mapping[pa.null()] = pd.Int64Dtype() + types_mapper = mapping.get + elif dtype_backend == "pyarrow": + types_mapper = pd.ArrowDtype + elif using_string_dtype(): + types_mapper = _arrow_string_types_mapper() + elif dtype_backend is lib.no_default or dtype_backend == "numpy": + types_mapper = None + else: + raise NotImplementedError + + df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs) + return df diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 68c73483add3f..1bdb732cb10de 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -13,11 +13,10 @@ from pandas.util._decorators import doc from 
pandas.util._validators import check_dtype_backend -import pandas as pd from pandas.core.api import DataFrame from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import get_handle if TYPE_CHECKING: @@ -128,16 +127,4 @@ def read_feather( pa_table = feather.read_table( handles.handle, columns=columns, use_threads=bool(use_threads) ) - - if dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get) - - elif dtype_backend == "pyarrow": - return pa_table.to_pandas(types_mapper=pd.ArrowDtype) - - elif using_string_dtype(): - return pa_table.to_pandas(types_mapper=arrow_string_types_mapper()) - else: - raise NotImplementedError + return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 9414f45215029..c0499ce750cf0 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -40,7 +40,6 @@ from pandas.core.dtypes.dtypes import PeriodDtype from pandas import ( - ArrowDtype, DataFrame, Index, MultiIndex, @@ -52,6 +51,7 @@ from pandas.core.reshape.concat import concat from pandas.core.shared_docs import _shared_docs +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( IOHandles, dedup_names, @@ -997,18 +997,7 @@ def read(self) -> DataFrame | Series: if self.engine == "pyarrow": pyarrow_json = import_optional_dependency("pyarrow.json") pa_table = pyarrow_json.read_json(self.data) - - mapping: type[ArrowDtype] | None | Callable - if self.dtype_backend == "pyarrow": - mapping = ArrowDtype - elif self.dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - else: - mapping = None - - return pa_table.to_pandas(types_mapper=mapping) + return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) elif self.engine == "ujson": if self.lines: if self.chunksize: diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 5706336b71697..d7f473a929568 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -9,16 +9,13 @@ Literal, ) -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas.core.indexes.api import default_index -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( get_handle, is_fsspec_url, @@ -117,21 +114,7 @@ def read_orc( pa_table = orc.read_table( source=source, columns=columns, filesystem=filesystem, **kwargs ) - if dtype_backend is not lib.no_default: - if dtype_backend == "pyarrow": - df = pa_table.to_pandas(types_mapper=pd.ArrowDtype) - else: - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - df = pa_table.to_pandas(types_mapper=mapping.get) - return df - else: - if using_string_dtype(): - types_mapper = arrow_string_types_mapper() - else: - types_mapper = None - return pa_table.to_pandas(types_mapper=types_mapper) + return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) def to_orc( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index cc33c87dfc55a..01e320cdb1b72 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -10,9 +10,11 @@ Literal, ) import warnings 
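# Illustrative sketch (not part of the patch): the private helper introduced
# by this refactor replaces the per-reader types_mapper plumbing. Assumes
# pyarrow is installed; arrow_table_to_pandas is internal pandas API, not a
# stable public entry point.
import pyarrow as pa

from pandas.io._util import arrow_table_to_pandas

table = pa.table({"a": [1, None], "b": ["x", "y"]})
df = arrow_table_to_pandas(table, dtype_backend="numpy_nullable")
print(df.dtypes)  # a: Int64, b: a nullable string dtype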
-from warnings import catch_warnings +from warnings import ( + catch_warnings, + filterwarnings, +) -from pandas._config import using_string_dtype from pandas._config.config import _get_option from pandas._libs import lib @@ -22,14 +24,13 @@ from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas import ( DataFrame, get_option, ) from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( IOHandles, get_handle, @@ -250,20 +251,10 @@ def read( kwargs["use_pandas_metadata"] = True to_pandas_kwargs = {} - if dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - to_pandas_kwargs["types_mapper"] = mapping.get - elif dtype_backend == "pyarrow": - to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] - elif using_string_dtype(): - to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() manager = _get_option("mode.data_manager", silent=True) if manager == "array": - to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment] - + to_pandas_kwargs["split_blocks"] = True path_or_handle, handles, filesystem = _get_path_or_handle( path, filesystem, @@ -278,7 +269,18 @@ def read( filters=filters, **kwargs, ) - result = pa_table.to_pandas(**to_pandas_kwargs) + + with catch_warnings(): + filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + result = arrow_table_to_pandas( + pa_table, + dtype_backend=dtype_backend, + to_pandas_kwargs=to_pandas_kwargs, + ) if manager == "array": result = result._as_manager("array", copy=False) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index c774638fd73f7..a7f01e6322755 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,8 +3,6 @@ from typing import TYPE_CHECKING import warnings -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -16,18 +14,14 @@ from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.inference import is_integer -import pandas as pd -from pandas import DataFrame - -from pandas.io._util import ( - _arrow_dtype_mapping, - arrow_string_types_mapper, -) +from pandas.io._util import arrow_table_to_pandas from pandas.io.parsers.base_parser import ParserBase if TYPE_CHECKING: from pandas._typing import ReadBuffer + from pandas import DataFrame + class ArrowParserWrapper(ParserBase): """ @@ -287,17 +281,14 @@ def read(self) -> DataFrame: table = table.cast(new_schema) - if dtype_backend == "pyarrow": - frame = table.to_pandas(types_mapper=pd.ArrowDtype) - elif dtype_backend == "numpy_nullable": - # Modify the default mapping to also - # map null to Int64 (to match other engines) - dtype_mapping = _arrow_dtype_mapping() - dtype_mapping[pa.null()] = pd.Int64Dtype() - frame = table.to_pandas(types_mapper=dtype_mapping.get) - elif using_string_dtype(): - frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + frame = arrow_table_to_pandas( + table, dtype_backend=dtype_backend, null_to_int64=True + ) - else: - frame = table.to_pandas() 
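# Illustrative sketch (not part of the patch): only the CSV path passes
# null_to_int64=True, so an all-null column comes back as Int64 under
# dtype_backend="numpy_nullable", matching the other parser engines.
import io

import pandas as pd

data = io.BytesIO(b"a,b\n,1\n,2\n")
df = pd.read_csv(data, engine="pyarrow", dtype_backend="numpy_nullable")
print(df.dtypes)  # column "a" (all null) maps to Int64 rather than object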
        return self._finalize_pandas_output(frame)

diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 07c95806d7326..7027702a696fe 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -49,10 +49,7 @@
     is_object_dtype,
     is_string_dtype,
 )
-from pandas.core.dtypes.dtypes import (
-    ArrowDtype,
-    DatetimeTZDtype,
-)
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
 from pandas.core.dtypes.missing import isna

 from pandas import get_option
@@ -68,6 +65,8 @@
 from pandas.core.internals.construction import convert_object_array
 from pandas.core.tools.datetimes import to_datetime

+from pandas.io._util import arrow_table_to_pandas
+
 if TYPE_CHECKING:
     from collections.abc import (
         Iterator,
@@ -2221,23 +2220,10 @@ def read_table(
         else:
             stmt = f"SELECT {select_list} FROM {table_name}"

-        mapping: type[ArrowDtype] | None | Callable
-        if dtype_backend == "pyarrow":
-            mapping = ArrowDtype
-        elif dtype_backend == "numpy_nullable":
-            from pandas.io._util import _arrow_dtype_mapping
-
-            mapping = _arrow_dtype_mapping().get
-        elif using_string_dtype():
-            from pandas.io._util import arrow_string_types_mapper
-
-            mapping = arrow_string_types_mapper()
-        else:
-            mapping = None
-
         with self.con.cursor() as cur:
             cur.execute(stmt)
-            df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping)
+            pa_table = cur.fetch_arrow_table()
+            df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)

         return _wrap_result_adbc(
             df,
@@ -2305,23 +2291,10 @@ def read_query(
         if chunksize:
             raise NotImplementedError("'chunksize' is not implemented for ADBC drivers")

-        mapping: type[ArrowDtype] | None | Callable
-        if dtype_backend == "pyarrow":
-            mapping = ArrowDtype
-        elif dtype_backend == "numpy_nullable":
-            from pandas.io._util import _arrow_dtype_mapping
-
-            mapping = _arrow_dtype_mapping().get
-        elif using_string_dtype():
-            from pandas.io._util import arrow_string_types_mapper
-
-            mapping = arrow_string_types_mapper()
-        else:
-            mapping = None
-
         with self.con.cursor() as cur:
             cur.execute(sql)
-            df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping)
+            pa_table = cur.fetch_arrow_table()
+            df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)

         return _wrap_result_adbc(
             df,
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 29efe7a457ff8..3676721c5e6b7 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -959,12 +959,12 @@ def sqlite_buildin_types(sqlite_buildin, types_data):

 adbc_connectable_iris = [
     pytest.param("postgresql_adbc_iris", marks=pytest.mark.db),
-    pytest.param("sqlite_adbc_iris", marks=pytest.mark.db),
+    "sqlite_adbc_iris",
 ]

 adbc_connectable_types = [
     pytest.param("postgresql_adbc_types", marks=pytest.mark.db),
-    pytest.param("sqlite_adbc_types", marks=pytest.mark.db),
+    "sqlite_adbc_types",
 ]

From 7958d6c73402ebf995c337fba7cfeadb18ec79e9 Mon Sep 17 00:00:00 2001
From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com>
Date: Sun, 17 Nov 2024 00:09:02 -0800
Subject: [PATCH 330/396] Backport PR #56013 on branch 2.3.x (BUG: get_indexer
 roundtripping through string dtype) (#60339)

Backport PR #56013: BUG: get_indexer roundtripping through string dtype

Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
---
 doc/source/whatsnew/v2.3.0.rst               |  2 +-
 pandas/core/indexes/base.py                  | 11 ++++++++++-
 pandas/tests/indexes/object/test_indexing.py |  9 +++++++++
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index 3e699e1a27b55..473d67acf6e74 
100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -119,7 +119,7 @@ Interval Indexing ^^^^^^^^ -- +- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`) - Missing diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5da327a82c02b..4896fb0ad1cd2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6695,7 +6695,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index: """ Analogue to maybe_cast_indexer for get_indexer instead of get_loc. """ - return ensure_index(target) + target_index = ensure_index(target) + if ( + not hasattr(target, "dtype") + and self.dtype == object + and target_index.dtype == "string" + ): + # If we started with a list-like, avoid inference to string dtype if self + # is object dtype (coercing to string dtype will alter the missing values) + target_index = Index(target, dtype=self.dtype) + return target_index @final def _validate_indexer( diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index d3df349027c00..42ef7e7a96f5e 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -62,6 +62,15 @@ def test_get_indexer_with_NA_values( expected = np.array([0, 1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_infer_string_missing_values(self): + # ensure the passed list is not cast to string but to object so that + # the None value is matched in the index + # https://github.com/pandas-dev/pandas/issues/55834 + idx = Index(["a", "b", None], dtype="object") + result = idx.get_indexer([None, "x"]) + expected = np.array([2, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + class TestGetIndexerNonUnique: def test_get_indexer_non_unique_nas(self, nulls_fixture): From 2cf68693d8f9666b33d64702e5bb731a0734b12e Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 17 Nov 2024 04:16:53 -0800 Subject: [PATCH 331/396] Backport PR #60333 on branch 2.3.x (BUG (string dtype): fix handling of string dtype in interchange protocol) (#60347) Backport PR #60333: BUG (string dtype): fix handling of string dtype in interchange protocol Co-authored-by: William Ayd --- pandas/core/interchange/from_dataframe.py | 12 ++++++++---- pandas/tests/interchange/test_impl.py | 9 ++------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 4162ebc33f0d6..53f18883ea3ad 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -6,6 +6,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas.compat._optional import import_optional_dependency from pandas.errors import SettingWithCopyError @@ -124,8 +126,6 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: ------- pd.DataFrame """ - # We need a dict of columns here, with each column being a NumPy array (at - # least for now, deal with non-NumPy dtypes later). 
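# Illustrative sketch (not part of the patch): after this fix, string columns
# coming back through the interchange protocol follow the active string
# option instead of always landing as object ndarrays.
import pandas as pd

df = pd.DataFrame({"a": ["x", "y"]})
result = pd.api.interchange.from_dataframe(df.__dataframe__())
print(result["a"].dtype)  # object by default; "str" when infer_string is on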
columns: dict[str, Any] = {} buffers = [] # hold on to buffers, keeps memory alive for name in df.column_names(): @@ -324,8 +324,12 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: # Add to our list of strings str_list[i] = string - # Convert the string list to a NumPy array - return np.asarray(str_list, dtype="object"), buffers + if using_string_dtype(): + res = pd.Series(str_list, dtype="str") + else: + res = np.asarray(str_list, dtype="object") # type: ignore[assignment] + + return res, buffers # type: ignore[return-value] def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray: diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index ef94c4c7aff2c..c32b31c297c5d 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import iNaT from pandas.compat import ( is_ci_environment, @@ -412,7 +410,6 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: pd.api.interchange.from_dataframe(df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_empty_string_column(): # https://github.com/pandas-dev/pandas/issues/56703 df = pd.DataFrame({"a": []}, dtype=str) @@ -421,13 +418,12 @@ def test_empty_string_column(): tm.assert_frame_equal(df, result) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_large_string(): # GH#56702 pytest.importorskip("pyarrow") df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") result = pd.api.interchange.from_dataframe(df.__dataframe__()) - expected = pd.DataFrame({"a": ["x"]}, dtype="object") + expected = pd.DataFrame({"a": ["x"]}, dtype="str") tm.assert_frame_equal(result, expected) @@ -438,7 +434,6 @@ def test_non_str_names(): assert names == ["0"] -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_non_str_names_w_duplicates(): # https://github.com/pandas-dev/pandas/issues/56701 df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]}) @@ -449,7 +444,7 @@ def test_non_str_names_w_duplicates(): "Expected a Series, got a DataFrame. This likely happened because you " "called __dataframe__ on a DataFrame which, after converting column " r"names to string, resulted in duplicated names: Index\(\['0', '0'\], " - r"dtype='object'\). Please rename these columns before using the " + r"dtype='(str|object)'\). Please rename these columns before using the " "interchange protocol." 
         ),
     ):

From 0bcd25088b38f3c97417e0c3672b4cb5ba7291e9 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 17 Nov 2024 17:04:18 +0100
Subject: [PATCH 332/396] [backport 2.3.x] CI: update fastparquet xfails for
 new release (#60337) (#60344)

---
 pandas/tests/io/test_parquet.py | 37 +++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index e43aae6a2e9e7..87f9b0108402c 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -1223,11 +1223,17 @@ def test_duplicate_columns(self, fp):
         msg = "Cannot create parquet dataset with duplicate column names"
         self.check_error_on_write(df, fp, ValueError, msg)

-    @pytest.mark.xfail(
-        Version(np.__version__) >= Version("2.0.0"),
-        reason="fastparquet uses np.float_ in numpy2",
-    )
-    def test_bool_with_none(self, fp):
+    def test_bool_with_none(self, fp, request):
+        import fastparquet
+
+        if Version(fastparquet.__version__) < Version("2024.11.0") and Version(
+            np.__version__
+        ) >= Version("2.0.0"):
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason=("fastparquet uses np.float_ in numpy2"),
+                )
+            )
         df = pd.DataFrame({"a": [True, None, False]})
         expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
         # Fastparquet bug in 0.7.1 makes it so that this dtype becomes
@@ -1342,12 +1348,21 @@ def test_empty_dataframe(self, fp):
         expected = df.copy()
         check_round_trip(df, fp, expected=expected)

-    @pytest.mark.xfail(
-        _HAVE_FASTPARQUET and Version(fastparquet.__version__) > Version("2022.12"),
-        reason="fastparquet bug, see https://github.com/dask/fastparquet/issues/929",
-    )
-    @pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index")
-    def test_timezone_aware_index(self, fp, timezone_aware_date_list):
+    def test_timezone_aware_index(self, fp, timezone_aware_date_list, request):
+        import fastparquet
+
+        if Version(fastparquet.__version__) > Version("2022.12") and Version(
+            fastparquet.__version__
+        ) < Version("2024.11.0"):
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason=(
+                        "fastparquet bug, see "
+                        "https://github.com/dask/fastparquet/issues/929"
+                    ),
+                )
+            )
+
         idx = 5 * [timezone_aware_date_list]
         df = pd.DataFrame(index=idx, data={"index_as_col": idx})

From 112c2e910d9c9f96ad3c0c5c66bc3928fc687a9e Mon Sep 17 00:00:00 2001
From: William Ayd
Date: Mon, 18 Nov 2024 18:33:33 -0500
Subject: [PATCH 333/396] Backport PR #60321: TST (string dtype): resolve all
 xfails in IO pars… (#60330)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Backport PR #60321: TST (string dtype): resolve all xfails in IO parser
  tests

(cherry picked from commit ee3c18f51b393893ed6e31214c7be2f9427ce0c9)

* BUG: Avoid RangeIndex conversion in read_csv if dtype is specified (#59316)

Co-authored-by: Joris Van den Bossche
Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 pandas/io/parsers/base_parser.py              | 36 +++++++++++++------
 .../tests/io/parser/common/test_chunksize.py  | 13 +++----
 .../io/parser/common/test_file_buffer_url.py  |  7 ++--
 pandas/tests/io/parser/common/test_index.py   | 10 +++---
 .../io/parser/dtypes/test_dtypes_basic.py     | 22 +++++++++---
 pandas/tests/io/parser/test_c_parser_only.py  | 13 +++----
 pandas/tests/io/parser/test_converters.py     |  5 +--
 pandas/tests/io/parser/test_index_col.py      |  5 +--
pandas/tests/io/parser/test_mangle_dupes.py | 10 +++--- pandas/tests/io/parser/test_na_values.py | 25 +++++++------ pandas/tests/io/parser/test_parse_dates.py | 11 ++---- pandas/tests/io/parser/test_upcast.py | 3 -- 12 files changed, 89 insertions(+), 71 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 09f0f2af8e5c6..40e3ea6450647 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -464,7 +464,11 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: arrays = [] converters = self._clean_mapping(self.converters) - for i, arr in enumerate(index): + if self.index_names is not None: + names: Iterable = self.index_names + else: + names = itertools.cycle([None]) + for i, (arr, name) in enumerate(zip(index, names)): if try_parse_dates and self._should_parse_dates(i): arr = self._date_conv( arr, @@ -504,12 +508,17 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: arr, _ = self._infer_types( arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool ) - arrays.append(arr) - - names = self.index_names - index = ensure_index_from_sequences(arrays, names) + if cast_type is not None: + # Don't perform RangeIndex inference + idx = Index(arr, name=name, dtype=cast_type) + else: + idx = ensure_index_from_sequences([arr], [name]) + arrays.append(idx) - return index + if len(arrays) == 1: + return arrays[0] + else: + return MultiIndex.from_arrays(arrays) @final def _convert_to_ndarrays( @@ -1084,12 +1093,11 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None): dtype_dict: defaultdict[Hashable, Any] if not is_dict_like(dtype): # if dtype == None, default will be object. - default_dtype = dtype or object - dtype_dict = defaultdict(lambda: default_dtype) + dtype_dict = defaultdict(lambda: dtype) else: dtype = cast(dict, dtype) dtype_dict = defaultdict( - lambda: object, + lambda: None, {columns[k] if is_integer(k) else k: v for k, v in dtype.items()}, ) @@ -1106,8 +1114,14 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None): if (index_col is None or index_col is False) or index_names is None: index = default_index(0) else: - data = [Series([], dtype=dtype_dict[name]) for name in index_names] - index = ensure_index_from_sequences(data, names=index_names) + # TODO: We could return default_index(0) if dtype_dict[name] is None + data = [ + Index([], name=name, dtype=dtype_dict[name]) for name in index_names + ] + if len(data) == 1: + index = data[0] + else: + index = MultiIndex.from_arrays(data) index_col.sort() for i, n in enumerate(index_col): diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 7b70601addcad..5226476ef6eac 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import parsers as libparsers from pandas.errors import DtypeWarning @@ -230,8 +228,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): assert result.a.dtype == float -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_warn_if_chunks_have_mismatched_type(all_parsers): +def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string): warning_type = None parser = all_parsers size = 10000 @@ -259,8 +256,12 @@ def 
test_warn_if_chunks_have_mismatched_type(all_parsers): "Specify dtype option on import or set low_memory=False.", buf, ) - - assert df.a.dtype == object + if parser.engine == "c" and parser.low_memory: + assert df.a.dtype == object + elif using_infer_string: + assert df.a.dtype == "str" + else: + assert df.a.dtype == object @pytest.mark.parametrize("iterator", [True, False]) diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index c13b77f365496..d573b47bb3279 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -14,8 +14,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( EmptyDataError, ParserError, @@ -69,14 +67,13 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.index are different def test_path_path_lib(all_parsers): parser = all_parsers df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) tm.assert_frame_equal(df, result) diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 0121af53f1aa4..cdd65223a9c9f 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -8,8 +8,6 @@ import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -87,9 +85,13 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) -def test_multi_index_no_level_names(all_parsers, index_col): +def test_multi_index_no_level_names( + request, all_parsers, index_col, using_infer_string +): + if using_infer_string and all_parsers.engine == "pyarrow": + # result should have string columns instead of object dtype + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) data = """index1,index2,A,B,C,D foo,one,2,3,4,5 foo,two,7,8,9,10 diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 787941c5d0376..d28c43c45647a 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserWarning import pandas as pd @@ -24,6 +22,8 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) @@ -54,7 +54,6 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_per_column(all_parsers): parser = all_parsers @@ -68,7 +67,6 @@ def 
test_dtype_per_column(all_parsers): [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] ) expected["one"] = expected["one"].astype(np.float64) - expected["two"] = expected["two"].astype(object) result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) tm.assert_frame_equal(result, expected) @@ -598,6 +596,7 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_accurate_parsing_of_large_integers(all_parsers): # GH#52505 data = """SYMBOL,MOMENT,ID,ID_DEAL @@ -608,7 +607,7 @@ def test_accurate_parsing_of_large_integers(all_parsers): AMZN,20230301181139587,2023552585717889759,2023552585717263360 MSFT,20230301181139587,2023552585717889863,2023552585717263361 NVDA,20230301181139587,2023552585717889827,2023552585717263361""" - orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) + orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1 assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1 assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2 @@ -630,3 +629,16 @@ def test_dtypes_with_usecols(all_parsers): values = ["1", "4"] expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]}) tm.assert_frame_equal(result, expected) + + +def test_index_col_with_dtype_no_rangeindex(all_parsers): + data = StringIO("345.5,519.5,0\n519.5,726.5,1") + result = all_parsers.read_csv( + data, + header=None, + names=["start", "stop", "bin_id"], + dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32}, + index_col="bin_id", + ).index + expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id") + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 1501479510e17..5b72f76440349 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -17,8 +17,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import ( ParserError, @@ -185,8 +183,7 @@ def error(val: float, actual_val: Decimal) -> Decimal: assert max(precise_errors) <= max(normal_errors) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_usecols_dtypes(c_parser_only): +def test_usecols_dtypes(c_parser_only, using_infer_string): parser = c_parser_only data = """\ 1,2,3 @@ -211,8 +208,12 @@ def test_usecols_dtypes(c_parser_only): dtype={"b": int, "c": float}, ) - assert (result.dtypes == [object, int, float]).all() - assert (result2.dtypes == [object, float]).all() + if using_infer_string: + assert (result.dtypes == ["string", int, float]).all() + assert (result2.dtypes == ["string", float]).all() + else: + assert (result.dtypes == [object, int, float]).all() + assert (result2.dtypes == [object, float]).all() def test_disable_bool_parsing(c_parser_only): diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index a3c6dc8fd0898..1848e1e571fc1 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -186,7 +184,6 @@ 
def convert_score(x): tm.assert_frame_equal(results[0], results[1]) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("conv_f", [lambda x: x, str]) def test_converter_index_col_bug(all_parsers, conv_f): # see gh-1835 , GH#40589 @@ -205,7 +202,7 @@ def test_converter_index_col_bug(all_parsers, conv_f): StringIO(data), sep=";", index_col="A", converters={"A": conv_f} ) - xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object")) + xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A")) tm.assert_frame_equal(rs, xp) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 6dbfed2b6ae83..9224b743b8917 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -344,7 +342,6 @@ def test_infer_types_boolean_sum(all_parsers): tm.assert_frame_equal(result, expected, check_index_type=False) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)]) def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): # GH#9435 @@ -355,7 +352,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine") ) result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype}) - expected = DataFrame({"b": [2]}, index=Index([val], name="a")) + expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype)) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 32a8d3b81f470..80c32d3a6262e 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -7,9 +7,10 @@ import pytest -from pandas._config import using_string_dtype - -from pandas import DataFrame +from pandas import ( + DataFrame, + Index, +) import pandas._testing as tm xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @@ -120,7 +121,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.columns are different def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 @@ -132,7 +132,7 @@ def test_mangled_unnamed_placeholders(all_parsers): # This test recursively updates `df`. 
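# A minimal sketch of the read_csv behavior this patch pins down (it mirrors
# the new test_index_col_with_dtype_no_rangeindex added earlier in this
# patch): an explicit dtype for the index column is honored instead of being
# lost to RangeIndex inference.
from io import StringIO

import numpy as np
import pandas as pd

data = StringIO("345.5,519.5,0\n519.5,726.5,1")
result = pd.read_csv(
    data,
    header=None,
    names=["start", "stop", "bin_id"],
    dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32},
    index_col="bin_id",
).index
assert result.dtype == np.uint32 and result.name == "bin_id"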
for i in range(3): - expected = DataFrame() + expected = DataFrame(columns=Index([], dtype="str")) for j in range(i + 1): col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 5f9823f7225f9..dd168aaa45808 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.parsers import STR_NA_VALUES from pandas import ( @@ -260,7 +258,6 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "kwargs,expected", [ @@ -306,7 +303,9 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): ), ], ) -def test_na_values_keep_default(all_parsers, kwargs, expected, request): +def test_na_values_keep_default( + all_parsers, kwargs, expected, request, using_infer_string +): data = """\ A,B,C a,1,one @@ -324,8 +323,9 @@ def test_na_values_keep_default(all_parsers, kwargs, expected, request): with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), **kwargs) return - mark = pytest.mark.xfail() - request.applymarker(mark) + if not using_infer_string or "na_values" in kwargs: + mark = pytest.mark.xfail() + request.applymarker(mark) result = parser.read_csv(StringIO(data), **kwargs) tm.assert_frame_equal(result, expected) @@ -435,8 +435,6 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -@xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case @pytest.mark.parametrize( "na_filter,row_data", [ @@ -444,14 +442,21 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v (False, [["1", "A"], ["nan", "B"], ["3", "C"]]), ], ) -def test_na_values_na_filter_override(all_parsers, na_filter, row_data): +def test_na_values_na_filter_override( + request, all_parsers, na_filter, row_data, using_infer_string +): + parser = all_parsers + if parser.engine == "pyarrow": + # mismatched dtypes in both cases, FutureWarning in the True case + if not (using_infer_string and na_filter): + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.applymarker(mark) data = """\ A,B 1,A nan,B 3,C """ - parser = all_parsers result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter) expected = DataFrame(row_data, columns=["A", "B"]) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index be2015fca27d1..616fcb81cf055 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -16,8 +16,6 @@ import pytest import pytz -from pandas._config import using_string_dtype - from pandas._libs.tslibs import parsing import pandas as pd @@ -1799,7 +1797,6 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @skip_pyarrow # pandas.errors.ParserError: CSV parse error @pytest.mark.parametrize( "date_string", @@ -1807,7 +1804,7 @@ def test_parse_timezone(all_parsers): ) def test_invalid_parse_delimited_date(all_parsers, date_string): parser = all_parsers - expected = 
DataFrame({0: [date_string]}, dtype="object") + expected = DataFrame({0: [date_string]}, dtype="str") result = parser.read_csv( StringIO(date_string), header=None, @@ -2054,7 +2051,6 @@ def test_parse_dates_and_keep_original_column(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dayfirst_warnings(): # GH 12585 @@ -2087,7 +2083,7 @@ def test_dayfirst_warnings(): # first in DD/MM/YYYY, second in MM/DD/YYYY input = "date\n31/12/2014\n03/30/2011" - expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date") + expected = Index(["31/12/2014", "03/30/2011"], dtype="str", name="date") # A. use dayfirst=True res5 = read_csv( @@ -2204,7 +2200,6 @@ def test_parse_dates_and_string_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_parse_dot_separated_dates(all_parsers): # https://github.com/pandas-dev/pandas/issues/2586 parser = all_parsers @@ -2214,7 +2209,7 @@ def test_parse_dot_separated_dates(all_parsers): if parser.engine == "pyarrow": expected_index = Index( ["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"], - dtype="object", + dtype="str", name="a", ) warn = None diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index 01e576ba40f26..bc4c4c2e24e9c 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.parsers import ( _maybe_upcast, na_values, @@ -86,7 +84,6 @@ def test_maybe_upcaste_all_nan(): tm.assert_extension_array_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("val", [na_values[np.object_], "c"]) def test_maybe_upcast_object(val, string_storage): # GH#36712 From 3bcbf0c02c2ba79e577b5e8a3252a1816895e239 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 25 Nov 2024 15:59:09 -0800 Subject: [PATCH 334/396] Backport PR #60416 on branch 2.3.x (TST: Avoid hashing np.timedelta64 without unit) (#60418) Backport PR #60416: TST: Avoid hashing np.timedelta64 without unit --- pandas/tests/test_algos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 97d6415e0de05..d1e69cfa2b4ee 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1283,7 +1283,7 @@ def test_value_counts_nat(self): result_dt = algos.value_counts(dt) tm.assert_series_equal(result_dt, exp_dt) - exp_td = Series({np.timedelta64(10000): 1}, name="count") + exp_td = Series([1], index=[np.timedelta64(10000)], name="count") with tm.assert_produces_warning(FutureWarning, match=msg): result_td = algos.value_counts(td) tm.assert_series_equal(result_td, exp_td) From 2b37c980553064dd6d0df7f7d3d5335444c1c9c4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Dec 2024 20:01:43 +0100 Subject: [PATCH 335/396] [backport 2.3.x] String dtype: use ObjectEngine for indexing for now correctness over performance (#60329) (#60453) String dtype: use ObjectEngine for indexing for now correctness over performance (#60329) (cherry picked from commit 98f7e4deeff26a5ef993ee27104387a1a6e0d3d3) --- pandas/_libs/index.pyi | 3 + pandas/_libs/index.pyx | 26 +++++ pandas/core/indexes/base.py | 3 
+- pandas/tests/indexes/string/test_indexing.py | 104 +++++++++++++++++-- 4 files changed, 124 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 75db47bf3160e..9c3791a642768 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -68,6 +68,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ... class MaskedUInt8Engine(MaskedIndexEngine): ... class MaskedBoolEngine(MaskedUInt8Engine): ... +class StringObjectEngine(ObjectEngine): + def __init__(self, values: object, na_value) -> None: ... + class BaseMultiIndexCodesEngine: levels: list[np.ndarray] offsets: np.ndarray # ndarray[uint64_t, ndim=1] diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index ee6a11ddab004..365cc7c3cecfc 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -532,6 +532,32 @@ cdef class ObjectEngine(IndexEngine): return loc +cdef class StringObjectEngine(ObjectEngine): + + cdef: + object na_value + bint uses_na + + def __init__(self, ndarray values, na_value): + super().__init__(values) + self.na_value = na_value + self.uses_na = na_value is C_NA + + cdef bint _checknull(self, object val): + if self.uses_na: + return val is C_NA + else: + return util.is_nan(val) + + cdef _check_type(self, object val): + if isinstance(val, str): + return val + elif self._checknull(val): + return self.na_value + else: + raise KeyError(val) + + cdef class DatetimeEngine(Int64Engine): cdef: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4896fb0ad1cd2..ad39907e7400e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -884,6 +884,8 @@ def _engine( # error: Item "ExtensionArray" of "Union[ExtensionArray, # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr] target_values = self._data._ndarray # type: ignore[union-attr] + elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype): + return libindex.StringObjectEngine(target_values, self.dtype.na_value) # type: ignore[union-attr] # error: Argument 1 to "ExtensionEngine" has incompatible type # "ndarray[Any, Any]"; expected "ExtensionArray" @@ -6133,7 +6135,6 @@ def _should_fallback_to_positional(self) -> bool: def get_indexer_non_unique( self, target ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: - target = ensure_index(target) target = self._maybe_cast_listlike_indexer(target) if not self._should_compare(target) and not self._should_partial_index(target): diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py index 755b7109a5a04..d1a278af337b7 100644 --- a/pandas/tests/indexes/string/test_indexing.py +++ b/pandas/tests/indexes/string/test_indexing.py @@ -6,6 +6,51 @@ import pandas._testing as tm +def _isnan(val): + try: + return val is not pd.NA and np.isnan(val) + except TypeError: + return False + + +class TestGetLoc: + def test_get_loc(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + assert index.get_loc("b") == 1 + + def test_get_loc_raises(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="d"): + index.get_loc("d") + + def test_get_loc_invalid_value(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="1"): + index.get_loc(1) + + def test_get_loc_non_unique(self, any_string_dtype): + index = Index(["a", "b", "a"], dtype=any_string_dtype) + result = index.get_loc("a") + expected = np.array([True, False, True]) 
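# A minimal sketch (not part of the patch) of the lookups the new
# StringObjectEngine-backed tests above exercise on a string-dtype Index:
import pandas as pd

idx = pd.Index(["a", "b", "c"], dtype="string")
assert idx.get_loc("b") == 1
# Non-string labels raise KeyError rather than matching anything:
try:
    idx.get_loc(1)
except KeyError:
    pass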
+ tm.assert_numpy_array_equal(result, expected) + + def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + + def test_get_loc_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype) + if any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture)) + ): + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + else: + assert index.get_loc(nulls_fixture) == 2 + + class TestGetIndexer: @pytest.mark.parametrize( "method,expected", @@ -41,23 +86,60 @@ def test_get_indexer_strings_raises(self, any_string_dtype): ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] ) + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string): + # NaT and Decimal("NaN") from null_fixture are not supported for string dtype + index = Index(["a", "b", null], dtype=any_string_dtype) + result = index.get_indexer(["a", null, "c"]) + if using_infer_string: + expected = np.array([0, 2, -1], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + expected = np.array([0, -1, -1], dtype=np.intp) + else: + expected = np.array([0, 2, -1], dtype=np.intp) -class TestGetIndexerNonUnique: - @pytest.mark.xfail(reason="TODO(infer_string)", strict=False) - def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture): - index = Index(["a", "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + tm.assert_numpy_array_equal(result, expected) - expected_indexer = np.array([2], dtype=np.intp) - expected_missing = np.array([], dtype=np.intp) + +class TestGetIndexerNonUnique: + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_non_unique_nas( + self, any_string_dtype, null, using_infer_string + ): + index = Index(["a", "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + expected_indexer = np.array([0, -1], dtype=np.intp) + expected_missing = np.array([1], dtype=np.intp) + else: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) # actually non-unique - index = Index(["a", None, "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) - - expected_indexer = np.array([1, 3], dtype=np.intp) + index = Index(["a", null, "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) 
+ ): + pass + else: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) From 4c2d6b4db2ca76bd5da9d04378eafed6dfaed24b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 3 Dec 2024 12:30:28 -0800 Subject: [PATCH 336/396] Backport PR #60461 on branch 2.3.x (PERF: improve construct_1d_object_array_from_listlike) (#60483) Backport PR #60461: PERF: improve construct_1d_object_array_from_listlike Co-authored-by: Joris Van den Bossche --- pandas/core/dtypes/cast.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a1ef8a3e27e9c..f6fcd887f4528 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -87,8 +87,8 @@ if TYPE_CHECKING: from collections.abc import ( + Collection, Sequence, - Sized, ) from pandas._typing import ( @@ -1586,7 +1586,7 @@ def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj): return _maybe_unbox_datetimelike(value, dtype) -def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: +def construct_1d_object_array_from_listlike(values: Collection) -> np.ndarray: """ Transform any list-like object in a 1-dimensional numpy array of object dtype. @@ -1604,11 +1604,9 @@ def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: ------- 1-dimensional numpy array of dtype object """ - # numpy will try to interpret nested lists as further dimensions, hence - # making a 1D array that contains list-likes is a bit tricky: - result = np.empty(len(values), dtype="object") - result[:] = values - return result + # numpy will try to interpret nested lists as further dimensions in np.array(), + # hence explicitly making a 1D array using np.fromiter + return np.fromiter(values, dtype="object", count=len(values)) def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.ndarray: From 7f7e3055ecd85686d5c35f9efb98f394bf9a54d8 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 11 Dec 2024 14:13:14 -0800 Subject: [PATCH 337/396] Backport PR #60544 on branch 2.3.x (CI/TST: Use tm.external_error_raised for test_from_arrow_respecting_given_dtype_unsafe) (#60545) Backport PR #60544: CI/TST: Use tm.external_error_raised for test_from_arrow_respecting_given_dtype_unsafe Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 03ab7c7f1dad8..470ca0673c60e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1637,7 +1637,7 @@ def test_from_arrow_respecting_given_dtype(): def test_from_arrow_respecting_given_dtype_unsafe(): array = pa.array([1.5, 2.5], type=pa.float64()) - with pytest.raises(pa.ArrowInvalid, match="Float value 1.5 was truncated"): + with tm.external_error_raised(pa.ArrowInvalid): array.to_pandas(types_mapper={pa.float64(): ArrowDtype(pa.int64())}.get) From 9052c9eb2a16324e7c210991e4e96c64ae5e48f3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:03:08 -0800 Subject: [PATCH 338/396] CI: Ignore prompting in test-arm when apt-get installing (#60547) --- 
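# Sketch of the idea behind the construct_1d_object_array_from_listlike
# change earlier in this series (assumes NumPy >= 1.23 for object-dtype
# np.fromiter; a later patch in this series restores an explicit loop for
# older NumPy compatibility):
import numpy as np

values = [[1, 2], [3, 4]]
result = np.fromiter(values, dtype="object", count=len(values))
assert result.shape == (2,)  # 1D array holding the two list objects
assert np.array(values).shape == (2, 2)  # np.array would add a dimension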
.circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 50ff7a81ae103..2c52d7aee4e28 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -15,7 +15,6 @@ jobs: - checkout - run: .circleci/setup_env.sh - run: | - sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH \ LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD \ ci/run_tests.sh From 0c6959d6cfdd5c5672f20a8d64a9cdc601387bd9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 13 Dec 2024 14:19:44 +0100 Subject: [PATCH 339/396] [2.3.x] CI: update fastparquet xfails (#60559) CI: update fastparquet xfails --- pandas/tests/io/test_fsspec.py | 6 +++++- pandas/tests/io/test_gcs.py | 3 --- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 5ed64e3eb0958..cf59e3e4c4934 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -5,6 +5,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( DataFrame, date_range, @@ -168,7 +170,9 @@ def test_excel_options(fsspectest): assert fsspectest.test[0] == "read" -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") +@pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string) fastparquet" +) def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index c7671bfb513aa..9fc0f6eb47766 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat.pyarrow import pa_version_under17p0 from pandas import ( @@ -196,7 +194,6 @@ def test_to_csv_compression_encoding_gcs( tm.assert_frame_equal(df, read_df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") From ffe079172156a3d4213b2c4a64d929bc1e7b6e69 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 13 Dec 2024 14:20:10 +0100 Subject: [PATCH 340/396] [2.3.x] COMPAT: fix construct_1d_object_array_from_listlike for older numpy (#60558) COMPAT: fix construct_1d_object_array_from_listlike for older numpy --- pandas/core/dtypes/cast.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index f6fcd887f4528..d4263f7488a14 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1606,7 +1606,10 @@ def construct_1d_object_array_from_listlike(values: Collection) -> np.ndarray: """ # numpy will try to interpret nested lists as further dimensions in np.array(), # hence explicitly making a 1D array using np.fromiter - return np.fromiter(values, dtype="object", count=len(values)) + result = np.empty(len(values), dtype="object") + for i, obj in enumerate(values): + result[i] = obj + return result def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.ndarray: From 6d9a2b4b5b2395b3438a057ce75f0857cf7a5109 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke 
<10647082+mroeschke@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:56:15 -0800 Subject: [PATCH 341/396] Backport PR #60553: TST: filter possible RuntimeWarning in tests (#60555) * Backport PR #60553: TST: filter possible RuntimeWarning in tests * add more ignores --- pandas/tests/extension/test_interval.py | 25 +++++++++++++++++++ pandas/tests/frame/methods/test_to_numpy.py | 4 +++ pandas/tests/frame/test_constructors.py | 3 +++ pandas/tests/groupby/test_categorical.py | 3 +++ pandas/tests/groupby/test_groupby.py | 1 + pandas/tests/indexes/interval/test_astype.py | 6 +++++ pandas/tests/indexes/interval/test_formats.py | 3 +++ .../tests/indexes/interval/test_indexing.py | 3 +++ pandas/tests/indexes/test_setops.py | 1 + pandas/tests/io/excel/test_writers.py | 3 +++ pandas/tests/reshape/test_cut.py | 1 + 11 files changed, 53 insertions(+) diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 98dd1c5cb615f..6292e6051aa90 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -90,6 +90,31 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object(self, data): + super().test_hash_pandas_object(data) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object_works(self, data, as_frame): + super().test_hash_pandas_object_works(data, as_frame) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_str(self, data): + super().test_astype_str(data) + # TODO: either belongs in tests.arrays.interval or move into base tests. 
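# Sketch of the RuntimeWarning these filterwarnings marks suppress; the test
# name below is hypothetical, not taken from the patch:
import numpy as np
import pytest

@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning")
def test_nan_to_int_cast_does_not_error():
    # On recent NumPy, casting NaN to an integer dtype can emit
    # "RuntimeWarning: invalid value encountered in cast"; the mark keeps the
    # suite green when warnings are escalated to errors.
    np.array([np.nan]).astype(np.int64)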
def test_fillna_non_scalar_raises(data_missing): diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py index bdb9b2c055061..0731750aed0cf 100644 --- a/pandas/tests/frame/methods/test_to_numpy.py +++ b/pandas/tests/frame/methods/test_to_numpy.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas.util._test_decorators as td @@ -41,6 +42,9 @@ def test_to_numpy_copy(self, using_copy_on_write): else: assert df.to_numpy(copy=False, na_value=np.nan).base is arr + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_to_numpy_mixed_dtype_to_str(self): # https://github.com/pandas-dev/pandas/issues/35455 df = DataFrame([[Timestamp("2020-01-01 00:00:00"), 100.0]]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index fd770b368c9da..f16068e0b6538 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2450,6 +2450,9 @@ def test_construct_with_two_categoricalindex_series(self): ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_constructor_series_nonexact_categoricalindex(self): # GH 42424 ser = Series(range(100)) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 447df952fd0e5..cba02ae869889 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -67,6 +67,7 @@ def f(a): } +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_apply_use_categorical_name(df): cats = qcut(df.C, 4) @@ -338,6 +339,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_observed(request, using_infer_string, observed): # multiple groupers, don't re-expand the output space # of the grouper @@ -1556,6 +1558,7 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( assert (res.loc[unobserved_cats] == expected).all().all() +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_series_groupby_categorical_aggregation_getitem(): # GH 8870 d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]} diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9b362164c6149..b5588898d4580 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3118,6 +3118,7 @@ def test_groupby_numeric_only_std_no_result(numeric_only): dfgb.std(numeric_only=numeric_only) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_grouping_with_categorical_interval_columns(): # GH#34164 df = DataFrame({"x": [0.1, 0.2, 0.3, -0.4, 0.5], "w": ["a", "b", "a", "c", "a"]}) diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index 59c555b9644a1..dde5f38074efb 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -186,6 +186,12 @@ def test_subtype_datetimelike(self, index, subtype): with pytest.raises(TypeError, match=msg): index.astype(dtype) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_category(self, index): + 
super().test_astype_category(index) + class TestDatetimelikeSubtype(AstypeTests): """Tests specific to IntervalIndex with datetime-like subtype""" diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index f858ae137ca4e..73bbfc91028b3 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -59,6 +59,9 @@ def test_repr_floats(self): expected = "(329.973, 345.137] 1\n(345.137, 360.191] 2\ndtype: int64" assert result == expected + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) @pytest.mark.parametrize( "tuples, closed, expected_data", [ diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index fd03047b2c127..b5be7e0713cdf 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -341,6 +341,9 @@ def test_get_indexer_categorical(self, target, ordered): expected = index.get_indexer(target) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_get_indexer_categorical_with_nans(self): # GH#41934 nans in both index and in target ii = IntervalIndex.from_breaks(range(5)) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 3845744dc0717..f6a865ccbb3a0 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -519,6 +519,7 @@ def test_intersection_difference_match_empty(self, index, sort): tm.assert_index_equal(inter, diff, exact=True) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 57091b268a9db..f133423bc6a85 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -755,6 +755,9 @@ def test_excel_date_datetime_format(self, ext, path): # we need to use df_expected to check the result. 
tm.assert_frame_equal(rs2, df_expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_to_excel_interval_no_labels(self, path, using_infer_string): # see gh-19242 # diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 0811c69859c0d..cab2302b3d877 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -727,6 +727,7 @@ def test_cut_with_duplicated_index_lowest_included(): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_cut_with_nonexact_categorical_indices(): # GH 42424 From 3362822db00076222d3875a86b5f5625799d8862 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 17 Dec 2024 08:40:01 +0100 Subject: [PATCH 342/396] String dtype (2.3.x): avoid downcasting object to string in fillna/where/interpolate (#60183) --- pandas/_libs/lib.pyi | 3 ++ pandas/_libs/lib.pyx | 7 +++- pandas/core/internals/blocks.py | 38 ++++++++++++++++++--- pandas/tests/frame/methods/test_fillna.py | 21 +++--------- pandas/tests/frame/methods/test_replace.py | 37 +++----------------- pandas/tests/indexing/test_coercion.py | 6 +++- pandas/tests/series/methods/test_replace.py | 3 -- 7 files changed, 57 insertions(+), 58 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index b9fd970e68f5b..71a4d3ae2575f 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -86,6 +86,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: Literal[False] = ..., + convert_string: Literal[False] = ..., convert_to_nullable_dtype: Literal[False] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> npt.NDArray[np.object_ | np.number]: ... @@ -97,6 +98,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: bool = ..., + convert_string: bool = ..., convert_to_nullable_dtype: Literal[True] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @@ -108,6 +110,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: bool = ..., + convert_string: bool = ..., convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bc039917aef87..f72d6a5dad877 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2498,6 +2498,7 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_numeric=True, # NB: different default! 
bint convert_to_nullable_dtype=False, bint convert_non_numeric=False, + bint convert_string=True, object dtype_if_all_nat=None) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2747,7 +2748,11 @@ def maybe_convert_objects(ndarray[object] objects, dtype = StringDtype() return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif using_string_dtype() and is_string_array(objects, skipna=True): + elif ( + convert_string + and using_string_dtype() + and is_string_array(objects, skipna=True) + ): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(na_value=np.nan) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6ae591a5d4ac8..5be83aa38011b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -563,7 +563,12 @@ def _maybe_downcast( return blocks nbs = extend_blocks( - [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks] + [ + blk.convert( + using_cow=using_cow, copy=not using_cow, convert_string=False + ) + for blk in blocks + ] ) if caller == "fillna": if len(nbs) != len(blocks) or not all( @@ -636,6 +641,7 @@ def convert( *, copy: bool = True, using_cow: bool = False, + convert_string: bool = True, ) -> list[Block]: """ Attempt to coerce any object types to better types. Return a copy @@ -648,7 +654,10 @@ def convert( if self.ndim != 1 and self.shape[0] != 1: blocks = self.split_and_operate( - Block.convert, copy=copy, using_cow=using_cow + Block.convert, + copy=copy, + using_cow=using_cow, + convert_string=convert_string, ) if all(blk.dtype.kind == "O" for blk in blocks): # Avoid fragmenting the block if convert is a no-op @@ -666,6 +675,7 @@ def convert( res_values = lib.maybe_convert_objects( values, # type: ignore[arg-type] convert_non_numeric=True, + convert_string=convert_string, ) refs = None if ( @@ -851,6 +861,7 @@ def replace( mask: npt.NDArray[np.bool_] | None = None, using_cow: bool = False, already_warned=None, + convert_string=None, ) -> list[Block]: """ replace the to_replace value with value, possible to create new @@ -915,7 +926,11 @@ def replace( if get_option("future.no_silent_downcasting") is True: blocks = [blk] else: - blocks = blk.convert(copy=False, using_cow=using_cow) + blocks = blk.convert( + copy=False, + using_cow=using_cow, + convert_string=convert_string or self.dtype != _dtype_obj, + ) if len(blocks) > 1 or blocks[0].dtype != blk.dtype: warnings.warn( # GH#54710 @@ -944,6 +959,7 @@ def replace( inplace=True, mask=mask, using_cow=using_cow, + convert_string=convert_string, ) else: @@ -958,6 +974,7 @@ def replace( inplace=True, mask=mask[i : i + 1], using_cow=using_cow, + convert_string=convert_string, ) ) return blocks @@ -970,6 +987,7 @@ def _replace_regex( inplace: bool = False, mask=None, using_cow: bool = False, + convert_string: bool = True, already_warned=None, ) -> list[Block]: """ @@ -1029,7 +1047,9 @@ def _replace_regex( ) already_warned.warned_already = True - nbs = block.convert(copy=False, using_cow=using_cow) + nbs = block.convert( + copy=False, using_cow=using_cow, convert_string=convert_string + ) opt = get_option("future.no_silent_downcasting") if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: warnings.warn( @@ -1068,6 +1088,8 @@ def replace_list( values._replace(to_replace=src_list, value=dest_list, inplace=True) return [blk] + convert_string = self.dtype != _dtype_obj + # Exclude anything that we know we won't contain pairs = [ (x, y) @@ -1152,6 +1174,7 @@ def 
replace_list( inplace=inplace, regex=regex, using_cow=using_cow, + convert_string=convert_string, ) if using_cow and i != src_len: @@ -1174,7 +1197,9 @@ def replace_list( nbs = [] for res_blk in result: converted = res_blk.convert( - copy=True and not using_cow, using_cow=using_cow + copy=True and not using_cow, + using_cow=using_cow, + convert_string=convert_string, ) if len(converted) > 1 or converted[0].dtype != res_blk.dtype: warnings.warn( @@ -1204,6 +1229,7 @@ def _replace_coerce( inplace: bool = True, regex: bool = False, using_cow: bool = False, + convert_string: bool = True, ) -> list[Block]: """ Replace value corresponding to the given boolean array with another @@ -1233,6 +1259,7 @@ def _replace_coerce( inplace=inplace, mask=mask, using_cow=using_cow, + convert_string=convert_string, ) else: if value is None: @@ -1256,6 +1283,7 @@ def _replace_coerce( inplace=inplace, mask=mask, using_cow=using_cow, + convert_string=convert_string, ) # --------------------------------------------------------------------- diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index e2baa2567f5b4..9844122dc4b2d 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -132,21 +132,14 @@ def test_fillna_different_dtype(self, using_infer_string): [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] ) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.fillna({2: "foo"}) - else: - result = df.fillna({2: "foo"}) + result = df.fillna({2: "foo"}) expected = DataFrame( [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] ) + expected[2] = expected[2].astype("object") tm.assert_frame_equal(result, expected) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - return_value = df.fillna({2: "foo"}, inplace=True) - else: - return_value = df.fillna({2: "foo"}, inplace=True) + return_value = df.fillna({2: "foo"}, inplace=True) tm.assert_frame_equal(df, expected) assert return_value is None @@ -385,12 +378,8 @@ def test_fillna_dtype_conversion(self, using_infer_string): # empty block df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.fillna("nan") - else: - result = df.fillna("nan") - expected = DataFrame("nan", index=range(3), columns=["A", "B"]) + result = df.fillna("nan") + expected = DataFrame("nan", index=range(3), columns=["A", "B"], dtype=object) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("val", ["", 1, np.nan, 1.0]) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 8df9893e73766..2ee878893ce70 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -281,20 +281,12 @@ def test_regex_replace_dict_nested(self, mix_abc): tm.assert_frame_equal(res3, expec) tm.assert_frame_equal(res4, expec) - def test_regex_replace_dict_nested_non_first_character( - self, any_string_dtype, using_infer_string - ): + def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype): # GH 25259 dtype = any_string_dtype df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype) - if using_infer_string and any_string_dtype == "object": - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = 
df.replace({"a": "."}, regex=True) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}) - - else: - result = df.replace({"a": "."}, regex=True) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) def test_regex_replace_dict_nested_gh4115(self): @@ -429,31 +421,12 @@ def test_replace_regex_metachar(self, metachar): ], ) def test_regex_replace_string_types( - self, - data, - to_replace, - expected, - frame_or_series, - any_string_dtype, - using_infer_string, - request, + self, data, to_replace, expected, frame_or_series, any_string_dtype ): # GH-41333, GH-35977 dtype = any_string_dtype obj = frame_or_series(data, dtype=dtype) - if using_infer_string and any_string_dtype == "object": - if len(to_replace) > 1 and isinstance(obj, DataFrame): - request.node.add_marker( - pytest.mark.xfail( - reason="object input array that gets downcasted raises on " - "second pass" - ) - ) - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = obj.replace(to_replace, regex=True) - dtype = "str" - else: - result = obj.replace(to_replace, regex=True) + result = obj.replace(to_replace, regex=True) expected = frame_or_series(expected, dtype=dtype) tm.assert_equal(result, expected) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index ac3bfe3a13a44..4e1697eabf734 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -831,7 +831,7 @@ def replacer(self, how, from_key, to_key): raise ValueError return replacer - def test_replace_series(self, how, to_key, from_key, replacer): + def test_replace_series(self, how, to_key, from_key, replacer, using_infer_string): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") obj = obj.astype(from_key) @@ -856,6 +856,10 @@ def test_replace_series(self, how, to_key, from_key, replacer): else: exp = pd.Series(self.rep[to_key], index=index, name="yyy") + if using_infer_string and exp.dtype == "string" and obj.dtype == object: + # with infer_string, we disable the deprecated downcasting behavior + exp = exp.astype(object) + msg = "Downcasting behavior in `replace`" warn = FutureWarning if ( diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 0b0cf57a70c3f..0c2e0fdc2616f 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -768,7 +766,6 @@ def test_replace_value_none_dtype_numeric(self, val): expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_change_dtype_series(self): # GH#25797 df = pd.DataFrame({"Test": ["0.5", True, "0.6"]}, dtype=object) From eb22bf8f9c194a96b6d7583504ab7ca217238881 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 17 Dec 2024 16:01:33 -0800 Subject: [PATCH 343/396] Backport PR #60584 on branch 2.3.x (TST: Address matplotlib 3.10 deprecation of vert=) (#60586) * Backport PR #60584: TST: Address matplotlib 3.10 deprecation of vert= * Add missing 
import * Ignore pre-commit check --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/plotting/_matplotlib/boxplot.py | 5 +- pandas/plotting/_matplotlib/tools.py | 2 +- pandas/tests/plotting/frame/test_frame.py | 41 +++++++++++++--- pandas/tests/plotting/test_boxplot_method.py | 51 +++++++++++++++----- 4 files changed, 76 insertions(+), 23 deletions(-) diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index d2b76decaa75d..80f0349b205e6 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -7,6 +7,7 @@ ) import warnings +import matplotlib as mpl from matplotlib.artist import setp import numpy as np @@ -20,6 +21,7 @@ import pandas as pd import pandas.core.common as com +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import ( @@ -54,7 +56,8 @@ def _set_ticklabels(ax: Axes, labels: list[str], is_vertical: bool, **kwargs) -> ticks = ax.get_xticks() if is_vertical else ax.get_yticks() if len(ticks) != len(labels): i, remainder = divmod(len(ticks), len(labels)) - assert remainder == 0, remainder + if Version(mpl.__version__) < Version("3.10"): + assert remainder == 0, remainder labels *= i if is_vertical: ax.set_xticklabels(labels, **kwargs) diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 898b5b25e7b01..98441c5afbaa4 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -57,7 +57,7 @@ def format_date_labels(ax: Axes, rot) -> None: fig = ax.get_figure() if fig is not None: # should always be a Figure but can technically be None - maybe_adjust_figure(fig, bottom=0.2) + maybe_adjust_figure(fig, bottom=0.2) # type: ignore[arg-type] def table( diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 4ca4067214bbd..33366b4eabba5 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -1059,28 +1059,43 @@ def test_boxplot_series_positions(self, hist_df): tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), positions) assert len(ax.lines) == 7 * len(numeric_cols) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib 3.10", + ) def test_boxplot_vertical(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] # if horizontal, yticklabels are rotated - ax = df.plot.box(rot=50, fontsize=8, vert=False) + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) + ax = df.plot.box(rot=50, fontsize=8, **kwargs) _check_ticks_props(ax, xrot=0, yrot=50, ylabelsize=8) _check_text_labels(ax.get_yticklabels(), labels) assert len(ax.lines) == 7 * len(numeric_cols) - @pytest.mark.filterwarnings("ignore:Attempt:UserWarning") + @pytest.mark.filterwarnings("ignore::UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib version 3.10", + ) def test_boxplot_vertical_subplots(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + 
) axes = _check_plot_works( - df.plot.box, - default_axes=True, - subplots=True, - vert=False, - logx=True, + df.plot.box, default_axes=True, subplots=True, logx=True, **kwargs ) _check_axes_shape(axes, axes_num=3, layout=(1, 3)) _check_ax_scales(axes, xaxis="log") @@ -1088,12 +1103,22 @@ def test_boxplot_vertical_subplots(self, hist_df): _check_text_labels(ax.get_yticklabels(), [label]) assert len(ax.lines) == 7 + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib 3.10", + ) def test_boxplot_vertical_positions(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] positions = np.array([3, 2, 8]) - ax = df.plot.box(positions=positions, vert=False) + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) + ax = df.plot.box(positions=positions, **kwargs) _check_text_labels(ax.get_yticklabels(), labels) tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), positions) assert len(ax.lines) == 7 * len(numeric_cols) diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 76f7fa1f22eec..e1b03a34086c0 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -1,5 +1,7 @@ """ Test cases for .boxplot method """ +from __future__ import annotations + import itertools import string @@ -22,6 +24,7 @@ _check_ticks_props, _check_visible, ) +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing @@ -35,6 +38,17 @@ def _check_ax_limits(col, ax): assert y_max >= col.max() +if Version(mpl.__version__) < Version("3.10"): + verts: list[dict[str, bool | str]] = [{"vert": False}, {"vert": True}] +else: + verts = [{"orientation": "horizontal"}, {"orientation": "vertical"}] + + +@pytest.fixture(params=verts) +def vert(request): + return request.param + + class TestDataFramePlots: def test_stacked_boxplot_set_axis(self): # GH2980 @@ -315,7 +329,7 @@ def test_specified_props_kwd(self, props, expected): assert result[expected][0].get_color() == "C1" - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_plot_xlabel_ylabel(self, vert): df = DataFrame( { @@ -325,11 +339,11 @@ def test_plot_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.plot(kind="box", vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.plot(kind="box", xlabel=xlabel, ylabel=ylabel, **vert) assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_plot_box(self, vert): # GH 54941 rng = np.random.default_rng(2) @@ -338,14 +352,14 @@ def test_plot_box(self, vert): xlabel, ylabel = "x", "y" _, axs = plt.subplots(ncols=2, figsize=(10, 7), sharey=True) - df1.plot.box(ax=axs[0], vert=vert, xlabel=xlabel, ylabel=ylabel) - df2.plot.box(ax=axs[1], vert=vert, xlabel=xlabel, ylabel=ylabel) + df1.plot.box(ax=axs[0], xlabel=xlabel, ylabel=ylabel, **vert) + df2.plot.box(ax=axs[1], xlabel=xlabel, ylabel=ylabel, **vert) for ax in axs: assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel mpl.pyplot.close() - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def 
test_boxplot_xlabel_ylabel(self, vert): df = DataFrame( { @@ -355,11 +369,11 @@ def test_boxplot_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.boxplot(vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.boxplot(xlabel=xlabel, ylabel=ylabel, **vert) assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_boxplot_group_xlabel_ylabel(self, vert): df = DataFrame( { @@ -369,14 +383,20 @@ def test_boxplot_group_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.boxplot(by="group", vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.boxplot(by="group", xlabel=xlabel, ylabel=ylabel, **vert) for subplot in ax: assert subplot.get_xlabel() == xlabel assert subplot.get_ylabel() == ylabel mpl.pyplot.close() - @pytest.mark.parametrize("vert", [True, False]) - def test_boxplot_group_no_xlabel_ylabel(self, vert): + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + def test_boxplot_group_no_xlabel_ylabel(self, vert, request): + if Version(mpl.__version__) >= Version("3.10") and vert == { + "orientation": "horizontal" + }: + request.applymarker( + pytest.mark.xfail(reason=f"{vert} fails starting with matplotlib 3.10") + ) df = DataFrame( { "a": np.random.default_rng(2).standard_normal(10), @@ -384,9 +404,14 @@ def test_boxplot_group_no_xlabel_ylabel(self, vert): "group": np.random.default_rng(2).choice(["group1", "group2"], 10), } ) - ax = df.boxplot(by="group", vert=vert) + ax = df.boxplot(by="group", **vert) for subplot in ax: - target_label = subplot.get_xlabel() if vert else subplot.get_ylabel() + target_label = ( + subplot.get_xlabel() + if vert == {"vert": True} # noqa: PLR1714 + or vert == {"orientation": "vertical"} + else subplot.get_ylabel() + ) assert target_label == pprint_thing(["group"]) mpl.pyplot.close() From 75a1007e6c40ec765fb3764935e84bb34acf0163 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 18 Dec 2024 21:21:32 +0100 Subject: [PATCH 344/396] [backport 2.3.x] TST (string dtype): un-xfail string tests specific to object dtype (#59433) (#60180) Co-authored-by: jbrockmendel --- pandas/tests/copy_view/test_interp_fillna.py | 13 ++++------ pandas/tests/copy_view/test_replace.py | 3 +-- pandas/tests/test_algos.py | 26 ++++++++++++++------ 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index 338b76cbf1e7a..d0c4fa53faab9 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( NA, ArrowDtype, @@ -137,10 +135,9 @@ def test_interp_fill_functions_inplace( assert np.shares_memory(arr, get_array(df, "a")) is (dtype == "float64") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_interpolate_cleaned_fill_method(using_copy_on_write): - # Check that "method is set to None" case works correctly +def test_interpolate_cannot_with_object_dtype(using_copy_on_write): df = DataFrame({"a": ["a", np.nan, "c"], "b": 1}) + df["a"] = df["a"].astype(object) df_orig = df.copy() msg = "DataFrame.interpolate with object dtype" @@ -159,16 +156,16 @@ def test_interpolate_cleaned_fill_method(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") 
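As an illustrative aside (not part of any patch above): the renamed test pins down that object-dtype columns cannot be interpolated. A minimal sketch of the behavior, assuming a pandas 2.x install with default options:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": ["a", np.nan, "c"], "b": 1})
    df["a"] = df["a"].astype(object)
    # Object dtype has no notion of values lying "between" two entries, so
    # interpolation is rejected; forward-fill is the supported alternative.
    filled = df.ffill()
    print(filled["a"].tolist())  # ['a', 'a', 'c']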
-def test_interpolate_object_convert_no_op(using_copy_on_write): +def test_interpolate_object_convert_no_op(using_copy_on_write, using_infer_string): df = DataFrame({"a": ["a", "b", "c"], "b": 1}) + df["a"] = df["a"].astype(object) arr_a = get_array(df, "a") msg = "DataFrame.interpolate with method=pad is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): df.interpolate(method="pad", inplace=True) # Now CoW makes a copy, it should not! - if using_copy_on_write: + if using_copy_on_write and not using_infer_string: assert df._mgr._has_no_reference(0) assert np.shares_memory(arr_a, get_array(df, "a")) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index bc3edb1f72214..9e24ce319e3bf 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -356,10 +356,9 @@ def test_replace_empty_list(using_copy_on_write): assert not df2._mgr._has_no_reference(0) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize("value", ["d", None]) def test_replace_object_list_inplace(using_copy_on_write, value): - df = DataFrame({"a": ["a", "b", "c"]}) + df = DataFrame({"a": ["a", "b", "c"]}, dtype=object) arr = get_array(df, "a") df.replace(["c"], value, inplace=True) if using_copy_on_write or value is None: diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index d1e69cfa2b4ee..80ee0f6e067f9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1704,12 +1704,17 @@ def test_unique_complex_numbers(self, array, expected): class TestHashTable: - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "htable, data", [ - (ht.PyObjectHashTable, [f"foo_{i}" for i in range(1000)]), - (ht.StringHashTable, [f"foo_{i}" for i in range(1000)]), + ( + ht.PyObjectHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), + ( + ht.StringHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), (ht.Float64HashTable, np.arange(1000, dtype=np.float64)), (ht.Int64HashTable, np.arange(1000, dtype=np.int64)), (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)), @@ -1717,7 +1722,7 @@ class TestHashTable: ) def test_hashtable_unique(self, htable, data, writable): # output of maker has guaranteed unique elements - s = Series(data) + s = Series(data, dtype=data.dtype) if htable == ht.Float64HashTable: # add NaN for float column s.loc[500] = np.nan @@ -1744,12 +1749,17 @@ def test_hashtable_unique(self, htable, data, writable): reconstr = result_unique[result_inverse] tm.assert_numpy_array_equal(reconstr, s_duplicated.values) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "htable, data", [ - (ht.PyObjectHashTable, [f"foo_{i}" for i in range(1000)]), - (ht.StringHashTable, [f"foo_{i}" for i in range(1000)]), + ( + ht.PyObjectHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), + ( + ht.StringHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), (ht.Float64HashTable, np.arange(1000, dtype=np.float64)), (ht.Int64HashTable, np.arange(1000, dtype=np.int64)), (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)), @@ -1757,7 +1767,7 @@ def test_hashtable_unique(self, htable, data, writable): ) def test_hashtable_factorize(self, htable, writable, data): # output of maker has guaranteed unique elements - s = Series(data) + s = Series(data, 
dtype=data.dtype)
         if htable == ht.Float64HashTable:
             # add NaN for float column
             s.loc[500] = np.nan

From c07933716ef30860e66373b10fd0177c22cb5970 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 19 Dec 2024 09:42:30 +0100
Subject: [PATCH 345/396] [backport 2.3.x] TST (string dtype): resolve xfails
 in pandas/tests/copy_view (#60245) (#60257)

---
 pandas/_testing/__init__.py              | 28 +++++----------
 pandas/tests/copy_view/test_astype.py    | 22 ++++++------
 pandas/tests/copy_view/test_functions.py |  1 -
 pandas/tests/copy_view/test_methods.py   | 43 +++++++++++++-----------
 pandas/tests/copy_view/test_replace.py   | 18 ++++------
 5 files changed, 51 insertions(+), 61 deletions(-)

diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
index 2d066b581f1c6..d7197f23ce1e4 100644
--- a/pandas/_testing/__init__.py
+++ b/pandas/_testing/__init__.py
@@ -8,7 +8,6 @@
     TYPE_CHECKING,
     Callable,
     ContextManager,
-    cast,
 )
 import warnings
 
@@ -23,8 +22,6 @@
 
 from pandas.compat import pa_version_under10p1
 
-from pandas.core.dtypes.common import is_string_dtype
-
 import pandas as pd
 from pandas import (
     ArrowDtype,
@@ -83,8 +80,8 @@
     with_csv_dialect,
 )
 from pandas.core.arrays import (
+    ArrowExtensionArray,
     BaseMaskedArray,
-    ExtensionArray,
     NumpyExtensionArray,
 )
 from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
@@ -96,7 +93,6 @@
         NpDtype,
     )
 
-    from pandas.core.arrays import ArrowExtensionArray
 
 UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"]
 UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"]
@@ -530,24 +526,18 @@ def shares_memory(left, right) -> bool:
     if isinstance(left, pd.core.arrays.IntervalArray):
         return shares_memory(left._left, right) or shares_memory(left._right, right)
 
-    if (
-        isinstance(left, ExtensionArray)
-        and is_string_dtype(left.dtype)
-        and left.dtype.storage == "pyarrow"  # type: ignore[attr-defined]
-    ):
-        # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
-        left = cast("ArrowExtensionArray", left)
-        if (
-            isinstance(right, ExtensionArray)
-            and is_string_dtype(right.dtype)
-            and right.dtype.storage == "pyarrow"  # type: ignore[attr-defined]
-        ):
-            right = cast("ArrowExtensionArray", right)
+    if isinstance(left, ArrowExtensionArray):
+        if isinstance(right, ArrowExtensionArray):
+            # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
             left_pa_data = left._pa_array
             right_pa_data = right._pa_array
             left_buf1 = left_pa_data.chunk(0).buffers()[1]
             right_buf1 = right_pa_data.chunk(0).buffers()[1]
-            return left_buf1 == right_buf1
+            return left_buf1.address == right_buf1.address
+        else:
+            # if we have one ArrowExtensionArray and one other array, assume
+            # they can only share memory if they share the same numpy buffer
+            return np.shares_memory(left, right)
 
     if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray):
         # By convention, we'll say these share memory if they share *either*
diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
index e0e3f6dc058a4..45fc3333c49a7 100644
--- a/pandas/tests/copy_view/test_astype.py
+++ b/pandas/tests/copy_view/test_astype.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import HAS_PYARROW
 from pandas.compat.pyarrow import pa_version_under12p0
 import pandas.util._test_decorators as td
@@ -244,7 +242,6 @@ def test_astype_arrow_timestamp(using_copy_on_write):
     )
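As an illustrative aside (not part of the patch): the switch to comparing `.address` above matters because pyarrow `Buffer` equality compares contents, not identity, so two equal-but-distinct buffers would previously be reported as shared memory. A minimal sketch, assuming pyarrow is installed:

    import pyarrow as pa

    arr = pa.chunked_array([["x", "y", None]])
    dup = pa.chunked_array([["x", "y", None]])  # equal contents, separate memory
    ref = arr                                   # second reference, same memory

    buf = arr.chunk(0).buffers()[1]
    assert buf == dup.chunk(0).buffers()[1]                   # content equality
    assert buf.address == ref.chunk(0).buffers()[1].address   # true sharing
    assert buf.address != dup.chunk(0).buffers()[1].address   # no sharing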
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_convert_dtypes_infer_objects(using_copy_on_write): ser = Series(["a", "b", "c"]) ser_orig = ser.copy() @@ -256,7 +253,7 @@ def test_convert_dtypes_infer_objects(using_copy_on_write): ) if using_copy_on_write: - assert np.shares_memory(get_array(ser), get_array(result)) + assert tm.shares_memory(get_array(ser), get_array(result)) else: assert not np.shares_memory(get_array(ser), get_array(result)) @@ -264,17 +261,21 @@ def test_convert_dtypes_infer_objects(using_copy_on_write): tm.assert_series_equal(ser, ser_orig) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") -def test_convert_dtypes(using_copy_on_write): +def test_convert_dtypes(using_copy_on_write, using_infer_string): df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]}) df_orig = df.copy() df2 = df.convert_dtypes() if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert np.shares_memory(get_array(df2, "d"), get_array(df, "d")) - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + if using_infer_string and HAS_PYARROW: + # TODO the default nullable string dtype still uses python storage + # this should be changed to pyarrow if installed + assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d")) + assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c")) else: assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) @@ -282,4 +283,5 @@ def test_convert_dtypes(using_copy_on_write): assert not np.shares_memory(get_array(df2, "d"), get_array(df, "d")) df2.iloc[0, 0] = "x" + df2.iloc[0, 1] = 10 tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 23ed7f9edcd22..eefd27964e6ae 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -201,7 +201,6 @@ def test_concat_copy_keyword(using_copy_on_write, copy): assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) -# @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize( "func", [ diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 295d93580f451..09738fe1023fb 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import HAS_PYARROW from pandas.errors import SettingWithCopyWarning @@ -953,15 +951,19 @@ def test_head_tail(method, using_copy_on_write, warn_copy_on_write): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") -def test_infer_objects(using_copy_on_write): - df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"}) +def test_infer_objects(using_copy_on_write, using_infer_string): + df = DataFrame( + {"a": [1, 2], "b": Series(["x", "y"], dtype=object), "c": 1, "d": "x"} + ) df_orig = df.copy() df2 = df.infer_objects() if 
using_copy_on_write:
         assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
-        assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+        if using_infer_string:
+            assert not tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+        else:
+            assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
     else:
         assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
@@ -975,19 +977,16 @@ def test_infer_objects(using_copy_on_write):
     tm.assert_frame_equal(df, df_orig)
 
 
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-)
-def test_infer_objects_no_reference(using_copy_on_write):
+def test_infer_objects_no_reference(using_copy_on_write, using_infer_string):
     df = DataFrame(
         {
             "a": [1, 2],
-            "b": "c",
+            "b": Series(["x", "y"], dtype=object),
             "c": 1,
             "d": Series(
                 [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
             ),
-            "e": "b",
+            "e": Series(["z", "w"], dtype=object),
         }
     )
     df = df.infer_objects()
@@ -1001,16 +1000,22 @@ def test_infer_objects_no_reference(using_copy_on_write):
     df.iloc[0, 3] = Timestamp("2018-12-31")
     if using_copy_on_write:
         assert np.shares_memory(arr_a, get_array(df, "a"))
-        # TODO(CoW): Block splitting causes references here
-        assert not np.shares_memory(arr_b, get_array(df, "b"))
+        if using_infer_string:
+            # note that the underlying memory of arr_b has been copied anyway
+            # because of the assignment, but the EA is updated inplace so still
+            # appears to share memory
+            assert tm.shares_memory(arr_b, get_array(df, "b"))
+        else:
+            # TODO(CoW): Block splitting causes references here
+            assert not np.shares_memory(arr_b, get_array(df, "b"))
         assert np.shares_memory(arr_d, get_array(df, "d"))
 
 
-def test_infer_objects_reference(using_copy_on_write):
+def test_infer_objects_reference(using_copy_on_write, using_infer_string):
     df = DataFrame(
         {
             "a": [1, 2],
-            "b": "c",
+            "b": Series(["x", "y"], dtype=object),
             "c": 1,
             "d": Series(
                 [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
@@ -1029,7 +1034,8 @@ def test_infer_objects_reference(using_copy_on_write):
     df.iloc[0, 3] = Timestamp("2018-12-31")
     if using_copy_on_write:
         assert not np.shares_memory(arr_a, get_array(df, "a"))
-        assert not np.shares_memory(arr_b, get_array(df, "b"))
+        if not using_infer_string or HAS_PYARROW:
+            assert not np.shares_memory(arr_b, get_array(df, "b"))
         assert np.shares_memory(arr_d, get_array(df, "d"))
 
 
@@ -1184,7 +1190,6 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_writ
     assert np.shares_memory(get_array(obj, "a"), get_array(view, "a"))
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 @pytest.mark.parametrize("decimals", [-1, 0, 1])
 def test_round(using_copy_on_write, warn_copy_on_write, decimals):
     df = DataFrame({"a": [1, 2], "b": "c"})
@@ -1192,7 +1197,7 @@ def test_round(using_copy_on_write, warn_copy_on_write, decimals):
     df2 = df.round(decimals=decimals)
 
     if using_copy_on_write:
-        assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+        assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
         # TODO: Make inplace by using out parameter of ndarray.round?
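As an illustrative aside (not part of the patch): the assertion above is the core Copy-on-Write contract that an untouched column is only copied lazily. Roughly, assuming pandas 2.x with the copy_on_write option enabled:

    import numpy as np
    import pandas as pd

    pd.set_option("mode.copy_on_write", True)
    df = pd.DataFrame({"a": [1.5, 2.5], "b": "c"})
    df2 = df.round(decimals=1)

    # The untouched object column "b" is not copied up front ...
    assert np.shares_memory(df2["b"].to_numpy(), df["b"].to_numpy())
    # ... and mutating the result triggers the deferred copy, leaving df intact.
    df2.loc[0, "b"] = "x"
    assert df.loc[0, "b"] == "c"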
if decimals >= 0: # Ensure lazy copy if no-op diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 9e24ce319e3bf..c6c9eca47f3f4 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -1,10 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - from pandas import ( Categorical, DataFrame, @@ -14,7 +10,6 @@ from pandas.tests.copy_view.util import get_array -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "replace_kwargs", [ @@ -31,7 +26,7 @@ ], ) def test_replace(using_copy_on_write, replace_kwargs): - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df_replaced = df.replace(**replace_kwargs) @@ -39,7 +34,7 @@ def test_replace(using_copy_on_write, replace_kwargs): if using_copy_on_write: if (df_replaced["b"] == df["b"]).all(): assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) # mutating squeezed df triggers a copy-on-write for that column/block df_replaced.loc[0, "c"] = -1 @@ -61,26 +56,25 @@ def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write): with tm.assert_cow_warning(warn_copy_on_write): df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) if using_copy_on_write: - assert not np.shares_memory(arr, get_array(df, "a")) + assert not tm.shares_memory(arr, get_array(df, "a")) assert df._mgr._has_no_reference(0) tm.assert_frame_equal(view, df_orig) else: assert np.shares_memory(arr, get_array(df, "a")) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_replace_regex_inplace(using_copy_on_write): df = DataFrame({"a": ["aaa", "bbb"]}) arr = get_array(df, "a") df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) if using_copy_on_write: assert df._mgr._has_no_reference(0) - assert np.shares_memory(arr, get_array(df, "a")) + assert tm.shares_memory(arr, get_array(df, "a")) df_orig = df.copy() df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True) tm.assert_frame_equal(df_orig, df) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) def test_replace_regex_inplace_no_op(using_copy_on_write): @@ -362,7 +356,7 @@ def test_replace_object_list_inplace(using_copy_on_write, value): arr = get_array(df, "a") df.replace(["c"], value, inplace=True) if using_copy_on_write or value is None: - assert np.shares_memory(arr, get_array(df, "a")) + assert tm.shares_memory(arr, get_array(df, "a")) else: # This could be inplace assert not np.shares_memory(arr, get_array(df, "a")) From cfdcf0e55f7ee80f6a2c834eff12f2105eb3c2be Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 22 Dec 2024 13:37:26 +0100 Subject: [PATCH 346/396] [backport 2.3.x] TST (string dtype): clean-up assorted xfails (#60345) (#60349) TST (string dtype): clean-up assorted xfails (#60345) (cherry picked from commit e7d1964ab7405d54d919bb289318d01e9eb72cd1) --- pandas/tests/base/test_conversion.py | 9 ++------- pandas/tests/indexes/multi/test_setops.py | 5 +---- pandas/tests/indexes/test_base.py | 12 +----------- 
pandas/tests/io/excel/test_readers.py | 1 - pandas/tests/io/excel/test_writers.py | 5 +---- pandas/tests/reshape/test_union_categoricals.py | 9 +++++---- 6 files changed, 10 insertions(+), 31 deletions(-) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index e2bf19e2e736c..a65ab4d287d11 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 @@ -391,9 +389,6 @@ def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array): assert np.may_share_memory(result_nocopy1, result_nocopy2) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize("as_series", [True, False]) @pytest.mark.parametrize( "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)] @@ -405,13 +400,13 @@ def test_to_numpy_copy(arr, as_series, using_infer_string): # no copy by default result = obj.to_numpy() - if using_infer_string and arr.dtype == object: + if using_infer_string and arr.dtype == object and obj.dtype.storage == "pyarrow": assert np.shares_memory(arr, result) is False else: assert np.shares_memory(arr, result) is True result = obj.to_numpy(copy=False) - if using_infer_string and arr.dtype == object: + if using_infer_string and arr.dtype == object and obj.dtype.storage == "pyarrow": assert np.shares_memory(arr, result) is False else: assert np.shares_memory(arr, result) is True diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 31a5d2fb906eb..801a813955b41 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( CategoricalIndex, @@ -760,13 +758,12 @@ def test_intersection_keep_ea_dtypes(val, any_numeric_ea_dtype): tm.assert_index_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_union_with_na_when_constructing_dataframe(): # GH43222 series1 = Series( (1,), index=MultiIndex.from_arrays( - [Series([None], dtype="string"), Series([None], dtype="string")] + [Series([None], dtype="str"), Series([None], dtype="str")] ), ) series2 = Series((10, 20), index=MultiIndex.from_tuples(((None, None), ("a", "b")))) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 3bcc62445f0ac..e3b8a60354b61 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -8,12 +8,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import ( - HAS_PYARROW, - IS64, -) +from pandas.compat import IS64 from pandas.errors import InvalidIndexError import pandas.util._test_decorators as td @@ -862,11 +857,6 @@ def test_isin(self, values, index, expected): result = index.isin(values) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_isin_nan_common_object( self, nulls_fixture, nulls_fixture2, using_infer_string ): diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 3c5e1e1cf5afb..7c10b3e8661ef 100644 --- 
a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -591,7 +591,6 @@ def test_reader_dtype_str(self, read_ext, dtype, expected):
         actual = pd.read_excel(basename + read_ext, dtype=dtype)
         tm.assert_frame_equal(actual, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_dtype_backend(self, read_ext, dtype_backend, engine):
         # GH#36712
         if read_ext in (".xlsb", ".xls"):
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
index f133423bc6a85..d6e99de4f9d91 100644
--- a/pandas/tests/io/excel/test_writers.py
+++ b/pandas/tests/io/excel/test_writers.py
@@ -11,8 +11,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import is_platform_windows
 from pandas.compat._constants import PY310
 from pandas.compat._optional import import_optional_dependency
@@ -1316,12 +1314,11 @@ def test_freeze_panes(self, path):
         result = pd.read_excel(path, index_col=0)
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_path_path_lib(self, engine, ext):
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
             columns=Index(list("ABCD")),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         writer = partial(df.to_excel, engine=engine)
diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py
index 1d5d16f39e648..081feae6fc43f 100644
--- a/pandas/tests/reshape/test_union_categoricals.py
+++ b/pandas/tests/reshape/test_union_categoricals.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.core.dtypes.concat import union_categoricals
 
 import pandas as pd
@@ -124,12 +122,15 @@ def test_union_categoricals_nan(self):
         exp = Categorical([np.nan, np.nan, np.nan, np.nan])
         tm.assert_categorical_equal(res, exp)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize("val", [[], ["1"]])
     def test_union_categoricals_empty(self, val, request, using_infer_string):
         # GH 13759
         if using_infer_string and val == ["1"]:
-            request.applymarker(pytest.mark.xfail("object and strings dont match"))
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason="TODO(infer_string) object and strings don't match"
+                )
+            )
         res = union_categoricals([Categorical([]), Categorical(val)])
         exp = Categorical(val)
         tm.assert_categorical_equal(res, exp)

From 99ae39e4605fb018bdd4a9e50aa757cea4768989 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 23 Dec 2024 21:50:23 +0100
Subject: [PATCH 347/396] [2.3.x] TST: remove leftover xfail in excel
 test_readers.py (#60599)

---
 pandas/tests/io/excel/test_readers.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 7c10b3e8661ef..c62144adbaecb 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -16,8 +16,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import is_platform_windows
 import pandas.util._test_decorators as td
 
@@ -655,9 +653,6 @@ def test_dtype_backend_and_dtype(self, read_ext):
         )
         tm.assert_frame_equal(result, df)
 
-    @pytest.mark.xfail(
-        using_string_dtype(), reason="infer_string takes precedence", strict=False
-    )
     def test_dtype_backend_string(self, read_ext, 
string_storage): # GH#36712 if read_ext in (".xlsb", ".xls"): From e350f10978aa3b4b7a5f90a0cd9d39b030b4f959 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 26 Dec 2024 15:53:47 +0100 Subject: [PATCH 348/396] [backport 2.3.x] TST (string dtype): resolve xfails for frame fillna and replace tests + fix bug in replace for string (#60295) (#60331) * TST (string dtype): resolve xfails for frame fillna and replace tests + fix bug in replace for string (#60295) (cherry picked from commit fae3e8034faf66eb8ef00bcbed73d48e4ef791d3) * fix tests for default mode * fixes * cleanup * update indexing tests --- pandas/core/array_algos/replace.py | 2 + pandas/core/internals/blocks.py | 25 +++++-- pandas/tests/frame/methods/test_fillna.py | 23 ++---- pandas/tests/frame/methods/test_replace.py | 81 ++++++++++++---------- pandas/tests/indexing/test_coercion.py | 7 +- 5 files changed, 75 insertions(+), 63 deletions(-) diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 5f377276be480..7d40fb985a593 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -149,4 +149,6 @@ def re_replacer(s): if mask is None: values[:] = f(values) else: + if values.ndim != mask.ndim: + mask = np.broadcast_to(mask, values.shape) values[mask] = f(values[mask]) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5be83aa38011b..452c919449ec4 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -929,7 +929,7 @@ def replace( blocks = blk.convert( copy=False, using_cow=using_cow, - convert_string=convert_string or self.dtype != _dtype_obj, + convert_string=convert_string or self.dtype == "string", ) if len(blocks) > 1 or blocks[0].dtype != blk.dtype: warnings.warn( @@ -987,7 +987,7 @@ def _replace_regex( inplace: bool = False, mask=None, using_cow: bool = False, - convert_string: bool = True, + convert_string=None, already_warned=None, ) -> list[Block]: """ @@ -1048,10 +1048,18 @@ def _replace_regex( already_warned.warned_already = True nbs = block.convert( - copy=False, using_cow=using_cow, convert_string=convert_string + copy=False, + using_cow=using_cow, + convert_string=convert_string or self.dtype == "string", ) opt = get_option("future.no_silent_downcasting") - if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: + if ( + len(nbs) > 1 + or ( + nbs[0].dtype != block.dtype + and not (self.dtype == "string" and nbs[0].dtype == "string") + ) + ) and not opt: warnings.warn( # GH#54710 "Downcasting behavior in `replace` is deprecated and " @@ -1088,7 +1096,7 @@ def replace_list( values._replace(to_replace=src_list, value=dest_list, inplace=True) return [blk] - convert_string = self.dtype != _dtype_obj + convert_string = self.dtype == "string" # Exclude anything that we know we won't contain pairs = [ @@ -2167,6 +2175,13 @@ def where( if isinstance(self.dtype, (IntervalDtype, StringDtype)): # TestSetitemFloatIntervalWithIntIntervalValues blk = self.coerce_to_target_dtype(orig_other) + if ( + self.ndim == 2 + and isinstance(orig_cond, np.ndarray) + and orig_cond.ndim == 1 + and not is_1d_only_ea_dtype(blk.dtype) + ): + orig_cond = orig_cond[:, None] nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) return self._maybe_downcast( nbs, downcast=_downcast, using_cow=using_cow, caller="where" diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 9844122dc4b2d..c0fc72768e27f 100644 --- 
a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from pandas import ( @@ -91,8 +89,6 @@ def test_fillna_datetime(self, datetime_frame): with pytest.raises(ValueError, match=msg): datetime_frame.fillna(5, method="ffill") - # TODO(infer_string) test as actual error instead of xfail - @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") def test_fillna_mixed_type(self, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan @@ -126,7 +122,7 @@ def test_fillna_empty(self, using_copy_on_write): df.x.fillna(method=m, inplace=True) df.x.fillna(method=m) - def test_fillna_different_dtype(self, using_infer_string): + def test_fillna_different_dtype(self): # with different dtype (GH#3386) df = DataFrame( [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] @@ -136,6 +132,7 @@ def test_fillna_different_dtype(self, using_infer_string): expected = DataFrame( [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] ) + # column is originally float (all-NaN) -> filling with string gives object dtype expected[2] = expected[2].astype("object") tm.assert_frame_equal(result, expected) @@ -654,18 +651,10 @@ def test_fillna_col_reordering(self): filled = df.fillna(method="ffill") assert df.columns.tolist() == filled.columns.tolist() - # TODO(infer_string) test as actual error instead of xfail - @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") - def test_fill_corner(self, float_frame, float_string_frame): - mf = float_string_frame - mf.loc[mf.index[5:20], "foo"] = np.nan - mf.loc[mf.index[-10:], "A"] = np.nan - - filled = float_string_frame.fillna(value=0) - assert (filled.loc[filled.index[5:20], "foo"] == 0).all() - del float_string_frame["foo"] - - float_frame.reindex(columns=[]).fillna(value=0) + def test_fill_empty(self, float_frame): + df = float_frame.reindex(columns=[]) + result = df.fillna(value=0) + tm.assert_frame_equal(result, df) def test_fillna_downcast_dict(self): # GH#40809 diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 2ee878893ce70..0971fb7e604c0 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -30,7 +28,6 @@ def mix_abc() -> dict[str, list[float | str]]: class TestDataFrameReplace: - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_inplace(self, datetime_frame, float_string_frame): datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan @@ -46,7 +43,9 @@ def test_replace_inplace(self, datetime_frame, float_string_frame): mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan result = float_string_frame.replace(np.nan, 0) - expected = float_string_frame.fillna(value=0) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=0) tm.assert_frame_equal(result, expected) tsframe = datetime_frame.copy() @@ -290,34 +289,39 @@ def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype): tm.assert_frame_equal(result, expected) def 
test_regex_replace_dict_nested_gh4115(self): - df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) + df = DataFrame( + {"Type": Series(["Q", "T", "Q", "Q", "T"], dtype=object), "tmp": 2} + ) expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) msg = "Downcasting behavior in `replace`" with tm.assert_produces_warning(FutureWarning, match=msg): result = df.replace({"Type": {"Q": 0, "T": 1}}) + tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") - def test_regex_replace_list_to_scalar(self, mix_abc): + def test_regex_replace_list_to_scalar(self, mix_abc, using_infer_string): df = DataFrame(mix_abc) expec = DataFrame( { "a": mix_abc["a"], - "b": np.array([np.nan] * 4), + "b": [np.nan] * 4, "c": [np.nan, np.nan, np.nan, "d"], } ) + if using_infer_string: + expec["b"] = expec["b"].astype("str") msg = "Downcasting behavior in `replace`" - with tm.assert_produces_warning(FutureWarning, match=msg): + warn = None if using_infer_string else FutureWarning + with tm.assert_produces_warning(warn, match=msg): res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) res2 = df.copy() res3 = df.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): return_value = res2.replace( [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True ) assert return_value is None - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): return_value = res3.replace( regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True ) @@ -326,7 +330,6 @@ def test_regex_replace_list_to_scalar(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? 
df = DataFrame(mix_abc) @@ -342,7 +345,6 @@ def test_regex_replace_str_to_numeric(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) @@ -539,21 +541,28 @@ def test_replace_convert(self): res = rep.dtypes tm.assert_series_equal(expec, res) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_mixed(self, float_string_frame): mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan result = float_string_frame.replace(np.nan, -18) - expected = float_string_frame.fillna(value=-18) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=-18) tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result.replace(-18, np.nan), float_string_frame) + expected2 = float_string_frame.copy() + expected2["foo"] = expected2["foo"].astype(object) + tm.assert_frame_equal(result.replace(-18, np.nan), expected2) result = float_string_frame.replace(np.nan, -1e8) - expected = float_string_frame.fillna(value=-1e8) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=-1e8) tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result.replace(-1e8, np.nan), float_string_frame) + expected2 = float_string_frame.copy() + expected2["foo"] = expected2["foo"].astype(object) + tm.assert_frame_equal(result.replace(-1e8, np.nan), expected2) def test_replace_mixed_int_block_upcasting(self): # int block upcasting @@ -614,15 +623,11 @@ def test_replace_mixed2(self, using_infer_string): expected = DataFrame( { - "A": Series(["foo", "bar"]), + "A": Series(["foo", "bar"], dtype="object"), "B": Series([0, "foo"], dtype="object"), } ) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.replace([1, 2], ["foo", "bar"]) - else: - result = df.replace([1, 2], ["foo", "bar"]) + result = df.replace([1, 2], ["foo", "bar"]) tm.assert_frame_equal(result, expected) def test_replace_mixed3(self): @@ -931,7 +936,7 @@ def test_replace_limit(self): # TODO pass - def test_replace_dict_no_regex(self): + def test_replace_dict_no_regex(self, any_string_dtype): answer = Series( { 0: "Strongly Agree", @@ -939,7 +944,8 @@ def test_replace_dict_no_regex(self): 2: "Neutral", 3: "Disagree", 4: "Strongly Disagree", - } + }, + dtype=any_string_dtype, ) weights = { "Agree": 4, @@ -954,7 +960,7 @@ def test_replace_dict_no_regex(self): result = answer.replace(weights) tm.assert_series_equal(result, expected) - def test_replace_series_no_regex(self): + def test_replace_series_no_regex(self, any_string_dtype): answer = Series( { 0: "Strongly Agree", @@ -962,7 +968,8 @@ def test_replace_series_no_regex(self): 2: "Neutral", 3: "Disagree", 4: "Strongly Disagree", - } + }, + dtype=any_string_dtype, ) weights = Series( { @@ -1060,16 +1067,15 @@ def test_nested_dict_overlapping_keys_replace_str(self): expected = df.replace({"a": dict(zip(astr, bstr))}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") - def test_replace_swapping_bug(self, using_infer_string): + def test_replace_swapping_bug(self): df = DataFrame({"a": [True, 
False, True]}) res = df.replace({"a": {True: "Y", False: "N"}}) - expect = DataFrame({"a": ["Y", "N", "Y"]}) + expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object) tm.assert_frame_equal(res, expect) df = DataFrame({"a": [0, 1, 0]}) res = df.replace({"a": {0: "Y", 1: "N"}}) - expect = DataFrame({"a": ["Y", "N", "Y"]}) + expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object) tm.assert_frame_equal(res, expect) def test_replace_period(self): @@ -1345,7 +1351,7 @@ def test_replace_commutative(self, df, to_replace, exp): ) def test_replace_replacer_dtype(self, replacer): # GH26632 - df = DataFrame(["a"]) + df = DataFrame(["a"], dtype=object) msg = "Downcasting behavior in `replace` " with tm.assert_produces_warning(FutureWarning, match=msg): result = df.replace({"a": replacer, "b": replacer}) @@ -1462,6 +1468,7 @@ def test_replace_value_category_type(self): input_df = input_df.replace("obj1", "obj9") result = input_df.replace("cat2", "catX") + result = result.astype({"col1": "int64", "col3": "float64", "col5": "str"}) tm.assert_frame_equal(result, expected) def test_replace_dict_category_type(self): @@ -1503,13 +1510,11 @@ def test_replace_with_compiled_regex(self): expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) - def test_replace_intervals(self, using_infer_string): + def test_replace_intervals(self): # https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) - warning = FutureWarning if using_infer_string else None - with tm.assert_produces_warning(warning, match="Downcasting"): - result = df.replace({"a": {pd.Interval(0, 1): "x"}}) - expected = DataFrame({"a": ["x", "x"]}) + result = df.replace({"a": {pd.Interval(0, 1): "x"}}) + expected = DataFrame({"a": ["x", "x"]}, dtype=object) tm.assert_frame_equal(result, expected) def test_replace_unicode(self): diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 4e1697eabf734..ecc640cfd0571 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -856,7 +856,7 @@ def test_replace_series(self, how, to_key, from_key, replacer, using_infer_strin else: exp = pd.Series(self.rep[to_key], index=index, name="yyy") - if using_infer_string and exp.dtype == "string" and obj.dtype == object: + if using_infer_string and exp.dtype == "string": # with infer_string, we disable the deprecated downcasting behavior exp = exp.astype(object) @@ -889,8 +889,9 @@ def test_replace_series_datetime_tz( assert obj.dtype == from_key exp = pd.Series(self.rep[to_key], index=index, name="yyy") - if using_infer_string and to_key == "object": - assert exp.dtype == "string" + if using_infer_string and exp.dtype == "string": + # with infer_string, we disable the deprecated downcasting behavior + exp = exp.astype(object) else: assert exp.dtype == to_key From fb075b58a529a33a98e63fbdc18c5951334a351c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 29 Dec 2024 13:32:18 -0800 Subject: [PATCH 349/396] Backport PR #60614 on branch 2.3.x (TST(string dtype): Resolve to_latex xfail) (#60617) Backport PR #60614: TST(string dtype): Resolve to_latex xfail Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/io/formats/style.py | 2 +- pandas/tests/io/formats/style/test_to_latex.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/io/formats/style.py 
b/pandas/io/formats/style.py index b62f7581ac220..987577057e058 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1580,7 +1580,7 @@ def _update_ctx_header(self, attrs: DataFrame, axis: AxisInt) -> None: for j in attrs.columns: ser = attrs[j] for i, c in ser.items(): - if not c: + if not c or pd.isna(c): continue css_list = maybe_convert_css_to_tuples(c) if axis == 0: diff --git a/pandas/tests/io/formats/style/test_to_latex.py b/pandas/tests/io/formats/style/test_to_latex.py index b29c880d1f823..7f1443c3ee66b 100644 --- a/pandas/tests/io/formats/style/test_to_latex.py +++ b/pandas/tests/io/formats/style/test_to_latex.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, MultiIndex, @@ -731,7 +729,6 @@ def test_longtable_caption_label(styler, caption, cap_exp, label, lab_exp): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize( "columns, siunitx", From c24ac2ff75a0cb526db66d526ebbd6ae0d3cd5a3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 30 Dec 2024 12:29:25 -0500 Subject: [PATCH 350/396] Backport PR #60615: TST(string dtype): Resolve some HDF5 xfails (#60626) --- pandas/io/pytables.py | 2 + .../tests/io/pytables/test_file_handling.py | 45 ++++++++++++++----- pandas/tests/io/pytables/test_subclass.py | 3 -- pandas/tests/io/test_common.py | 3 -- 4 files changed, 36 insertions(+), 17 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 2e38303caa354..1293c9a0b8499 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -5274,6 +5274,8 @@ def _dtype_to_kind(dtype_str: str) -> str: kind = "integer" elif dtype_str == "object": kind = "object" + elif dtype_str == "str": + kind = "str" else: raise ValueError(f"cannot interpret dtype of [{dtype_str}]") diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index 9daf2a5910a08..1878f2a392e13 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -36,12 +36,11 @@ pytestmark = [ pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"]) -def test_mode(setup_path, tmp_path, mode): +def test_mode(setup_path, tmp_path, mode, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD"), dtype=object), @@ -90,10 +89,12 @@ def test_mode(setup_path, tmp_path, mode): read_hdf(path, "df", mode=mode) else: result = read_hdf(path, "df", mode=mode) + if using_infer_string: + df.columns = df.columns.astype("str") tm.assert_frame_equal(result, df) -def test_default_mode(tmp_path, setup_path): +def test_default_mode(tmp_path, setup_path, using_infer_string): # read_hdf uses default mode df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), @@ -103,7 +104,10 @@ def test_default_mode(tmp_path, setup_path): path = tmp_path / setup_path df.to_hdf(path, key="df", mode="w") result = read_hdf(path, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) def test_reopen_handle(tmp_path, setup_path): @@ -162,7 +166,7 @@ def test_reopen_handle(tmp_path, 
setup_path): assert not store.is_open -def test_open_args(setup_path): +def test_open_args(setup_path, using_infer_string): with tm.ensure_clean(setup_path) as path: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -177,8 +181,13 @@ def test_open_args(setup_path): store["df"] = df store.append("df2", df) - tm.assert_frame_equal(store["df"], df) - tm.assert_frame_equal(store["df2"], df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + + tm.assert_frame_equal(store["df"], expected) + tm.assert_frame_equal(store["df2"], expected) store.close() @@ -193,7 +202,7 @@ def test_flush(setup_path): store.flush(fsync=True) -def test_complibs_default_settings(tmp_path, setup_path): +def test_complibs_default_settings(tmp_path, setup_path, using_infer_string): # GH15943 df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -206,7 +215,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df", complevel=9) result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -217,7 +230,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df", complib="zlib") result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -228,7 +245,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df") result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -302,6 +323,7 @@ def test_complibs(tmp_path, lvl, lib, request): assert node.filters.complib == lib +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.skipif( not is_platform_little_endian(), reason="reason platform is not little endian" ) @@ -319,6 +341,7 @@ def test_encoding(setup_path): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "val", [ diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py index bbe1cd77e0d9f..03622faa2b5a8 100644 --- a/pandas/tests/io/pytables/test_subclass.py +++ b/pandas/tests/io/pytables/test_subclass.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Series, @@ -19,7 +17,6 @@ class TestHDFStoreSubclass: # GH 33748 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_supported_for_subclass_dataframe(self, 
tmp_path): data = {"a": [1, 2], "b": [3, 4]} sdf = tm.SubclassedDataFrame(data, dtype=np.intp) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index d1e42b297f143..a0dd64f1cb82b 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -18,8 +18,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import is_platform_windows import pandas.util._test_decorators as td @@ -379,7 +377,6 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): expected = f_path.read() assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) hdf support") def test_write_fspath_hdf5(self): # Same test as write_fspath_all, except HDF5 files aren't # necessarily byte-for-byte identical for a given dataframe, so we'll From 928d2215352564d6e46253a10765956d5a788634 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 2 Jan 2025 10:28:56 -0800 Subject: [PATCH 351/396] Backport PR #60636 on branch 2.3.x (TST(string dtype): Resolve xfail with apply returning an ndarray) (#60643) Backport PR #60636: TST(string dtype): Resolve xfail with apply returning an ndarray Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/tests/frame/methods/test_dtypes.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py index 2556c44e63a77..524a5587dce10 100644 --- a/pandas/tests/frame/methods/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd @@ -144,13 +142,9 @@ def test_dtypes_timedeltas(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_apply_np_array_return_type(self, using_infer_string): # GH 35517 df = DataFrame([["foo"]]) result = df.apply(lambda col: np.array("bar")) - if using_infer_string: - expected = Series([np.array(["bar"])]) - else: - expected = Series(["bar"]) + expected = Series(np.array("bar")) tm.assert_series_equal(result, expected) From e53967b9ad8abb1ba0709cbc37e8d4f4827d6225 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 2 Jan 2025 10:29:06 -0800 Subject: [PATCH 352/396] Backport PR #60635 on branch 2.3.x (TST(string dtype): Resolve xfail for corrwith) (#60644) Backport PR #60635: TST(string dtype): Resolve xfail for corrwith Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/tests/frame/methods/test_cov_corr.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 721ec4e43eb1b..9abf1996c43e6 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -328,7 +326,6 @@ def test_corrwith(self, datetime_frame, dtype): for row in index[:4]: tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) - @pytest.mark.xfail(using_string_dtype(), 
reason="TODO(infer_string)") def test_corrwith_with_objects(self, using_infer_string): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), @@ -342,9 +339,8 @@ def test_corrwith_with_objects(self, using_infer_string): df2["obj"] = "bar" if using_infer_string: - import pyarrow as pa - - with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): + msg = "Cannot perform reduction 'mean' with string dtype" + with pytest.raises(TypeError, match=msg): df1.corrwith(df2) else: with pytest.raises(TypeError, match="Could not convert"): From d90edeb33a40407c6e348222360e0ef69ea4e0c5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 3 Jan 2025 11:22:24 +0100 Subject: [PATCH 353/396] [backport 2.3.x] BUG/TST (string dtype): raise proper TypeError in interpolate (#60637) (#60652) BUG/TST (string dtype): raise proper TypeError in interpolate (#60637) * TST(string dtype): Resolve xfail for interpolate * Adjust arrow tests * Fixup for NumPyExtensionArray * Use tm.shares_memory (cherry picked from commit 5e50d3f3d2b0ee65f0d5bfda0c6da47ffd39dcfe) Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/core/arrays/arrow/array.py | 3 +++ pandas/core/arrays/numpy_.py | 3 +++ pandas/tests/extension/test_arrow.py | 11 +++++++++++ pandas/tests/frame/methods/test_interpolate.py | 17 ++++++++++------- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0c1e1d0c63c85..00992ade4f160 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2150,6 +2150,9 @@ def interpolate( See NDFrame.interpolate.__doc__. """ # NB: we return type(self) even if copy=False + if not self.dtype._is_numeric: + raise TypeError(f"Cannot interpolate with {self.dtype} dtype") + mask = self.isna() if self.dtype.kind == "f": data = self._pa_array.to_numpy() diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 9f7238a97d808..07fa6254d87f3 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -287,6 +287,9 @@ def interpolate( See NDFrame.interpolate.__doc__. 
""" # NB: we return type(self) even if copy=False + if not self.dtype._is_numeric: + raise TypeError(f"Cannot interpolate with {self.dtype} dtype") + if not copy: out_data = self._ndarray else: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 470ca0673c60e..d524ed5a16828 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3350,6 +3350,17 @@ def test_string_to_datetime_parsing_cast(): tm.assert_series_equal(result, expected) +@pytest.mark.skipif( + pa_version_under13p0, reason="pairwise_diff_checked not implemented in pyarrow" +) +def test_interpolate_not_numeric(data): + if not data.dtype._is_numeric: + ser = pd.Series(data) + msg = re.escape(f"Cannot interpolate with {ser.dtype} dtype") + with pytest.raises(TypeError, match=msg): + pd.Series(data).interpolate() + + def test_string_to_time_parsing_cast(): # GH 56463 string_times = ["11:41:43.076160"] diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index bbb5e59e4a274..ebee19e3de20a 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -69,11 +69,7 @@ def test_interpolate_inplace(self, frame_or_series, using_array_manager, request assert np.shares_memory(orig, obj.values) assert orig.squeeze()[1] == 1.5 - # TODO(infer_string) raise proper TypeError in case of string dtype - @pytest.mark.xfail( - using_string_dtype(), reason="interpolate doesn't work for string" - ) - def test_interp_basic(self, using_copy_on_write): + def test_interp_basic(self, using_copy_on_write, using_infer_string): df = DataFrame( { "A": [1, 2, np.nan, 4], @@ -90,6 +86,13 @@ def test_interp_basic(self, using_copy_on_write): "D": list("abcd"), } ) + if using_infer_string: + dtype = "str" if using_infer_string else "object" + msg = f"[Cc]annot interpolate with {dtype} dtype" + with pytest.raises(TypeError, match=msg): + df.interpolate() + return + msg = "DataFrame.interpolate with object dtype" with tm.assert_produces_warning(FutureWarning, match=msg): result = df.interpolate() @@ -111,8 +114,8 @@ def test_interp_basic(self, using_copy_on_write): tm.assert_frame_equal(df, expected) # check we DID operate inplace - assert np.shares_memory(df["C"]._values, cvalues) - assert np.shares_memory(df["D"]._values, dvalues) + assert tm.shares_memory(df["C"]._values, cvalues) + assert tm.shares_memory(df["D"]._values, dvalues) @pytest.mark.xfail( using_string_dtype(), reason="interpolate doesn't work for string" From 7db3ca4b38f8897c536bcd806c61e6578e417d79 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 3 Jan 2025 02:24:55 -0800 Subject: [PATCH 354/396] Backport PR #60454 on branch 2.3.x (String dtype: coerce missing values in indexers for string dtype Index) (#60649) * Backport PR #60454: String dtype: coerce missing values in indexers for string dtype Index * fixup import --------- Co-authored-by: Joris Van den Bossche --- pandas/_libs/index.pyx | 10 +----- pandas/tests/frame/indexing/test_indexing.py | 1 - pandas/tests/indexes/string/test_indexing.py | 33 ++++++++++---------- pandas/tests/reshape/test_pivot.py | 12 +++---- 4 files changed, 22 insertions(+), 34 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 365cc7c3cecfc..8bb839dee436d 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -536,23 +536,15 @@ cdef class StringObjectEngine(ObjectEngine): 
cdef: object na_value - bint uses_na def __init__(self, ndarray values, na_value): super().__init__(values) self.na_value = na_value - self.uses_na = na_value is C_NA - - cdef bint _checknull(self, object val): - if self.uses_na: - return val is C_NA - else: - return util.is_nan(val) cdef _check_type(self, object val): if isinstance(val, str): return val - elif self._checknull(val): + elif checknull(val): return self.na_value else: raise KeyError(val) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index c0ab51a484cdf..a8249ed7f9828 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -517,7 +517,6 @@ def test_setitem_ambig(self, using_infer_string): else: assert dm[2].dtype == np.object_ - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_None(self, float_frame): # GH #766 float_frame[None] = float_frame["A"] diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py index d1a278af337b7..648ee47ddc34c 100644 --- a/pandas/tests/indexes/string/test_indexing.py +++ b/pandas/tests/indexes/string/test_indexing.py @@ -13,6 +13,15 @@ def _isnan(val): return False +def _equivalent_na(dtype, null): + if dtype.na_value is pd.NA and null is pd.NA: + return True + elif _isnan(dtype.na_value) and _isnan(null): + return True + else: + return False + + class TestGetLoc: def test_get_loc(self, any_string_dtype): index = Index(["a", "b", "c"], dtype=any_string_dtype) @@ -41,14 +50,7 @@ def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture): def test_get_loc_missing(self, any_string_dtype, nulls_fixture): index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype) - if any_string_dtype == "string" and ( - (any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA) - or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture)) - ): - with pytest.raises(KeyError): - index.get_loc(nulls_fixture) - else: - assert index.get_loc(nulls_fixture) == 2 + assert index.get_loc(nulls_fixture) == 2 class TestGetIndexer: @@ -93,9 +95,8 @@ def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string): result = index.get_indexer(["a", null, "c"]) if using_infer_string: expected = np.array([0, 2, -1], dtype=np.intp) - elif any_string_dtype == "string" and ( - (any_string_dtype.na_value is pd.NA and null is not pd.NA) - or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null ): expected = np.array([0, -1, -1], dtype=np.intp) else: @@ -115,9 +116,8 @@ def test_get_indexer_non_unique_nas( if using_infer_string: expected_indexer = np.array([0, 2], dtype=np.intp) expected_missing = np.array([], dtype=np.intp) - elif any_string_dtype == "string" and ( - (any_string_dtype.na_value is pd.NA and null is not pd.NA) - or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null ): expected_indexer = np.array([0, -1], dtype=np.intp) expected_missing = np.array([1], dtype=np.intp) @@ -133,9 +133,8 @@ def test_get_indexer_non_unique_nas( if using_infer_string: expected_indexer = np.array([0, 1, 3], dtype=np.intp) - elif any_string_dtype == "string" and ( - (any_string_dtype.na_value is pd.NA and null is not pd.NA) - or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + elif any_string_dtype == "string" and not 
_equivalent_na( + any_string_dtype, null ): pass else: diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 75268ccee1d8c..519564a96aa7e 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2619,6 +2619,8 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() # pylint: disable=missing-kwoa + # this still fails because columns=None gets passed down to unstack as level=None + # while at that point None was converted to NaN @pytest.mark.xfail( using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" ) @@ -2637,10 +2639,7 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" - ) - def test_pivot_index_is_none(self): + def test_pivot_index_is_none(self, using_infer_string): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2651,11 +2650,10 @@ def test_pivot_index_is_none(self): result = df.pivot(columns="b", index=None, values="c") expected = DataFrame(3, index=[1], columns=Index([2], name="b")) + if using_infer_string: + expected.index.name = np.nan tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" - ) def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) From 65c1a54894a42ab0b0797b0f22d01d8b93f83d55 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 8 Jan 2025 10:57:56 -0800 Subject: [PATCH 355/396] Backport PR #60679 on branch 2.3.x (Update macos12 runner to macos13) (#60680) --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index c06146b8e67f6..eac63b424ead8 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -94,7 +94,7 @@ jobs: buildplat: - [ubuntu-22.04, manylinux_x86_64] - [ubuntu-22.04, musllinux_x86_64] - - [macos-12, macosx_x86_64] + - [macos-13, macosx_x86_64] # Note: M1 images on Github Actions start from macOS 14 - [macos-14, macosx_arm64] - [windows-2022, win_amd64] From 8ef73d66654eadd140647683eec48e0e719618d4 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Thu, 9 Jan 2025 12:10:55 -0500 Subject: [PATCH 356/396] Backport PR #60683: TST(string dtype): Resolve xfails in test_to_csv (#60687) Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/tests/frame/methods/test_astype.py | 2 +- pandas/tests/frame/methods/test_reset_index.py | 2 +- pandas/tests/frame/methods/test_to_csv.py | 14 ++++---------- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index ca3764ac87e95..938f9cfcde3f8 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -760,7 +760,7 @@ def test_astype_tz_object_conversion(self, tz): result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) GH#60639") def test_astype_dt64_to_string( self, frame_or_series, tz_naive_fixture, 
using_infer_string ): diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 8d93c97b6b68a..9e51ac0bc2612 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -646,7 +646,7 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes): tm.assert_frame_equal(res, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) - GH#60338") @pytest.mark.parametrize( "array, dtype", [ diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index aca3bb5bccd7c..3b6a54698b5b6 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserError import pandas as pd @@ -422,20 +420,18 @@ def test_to_csv_empty(self): result, expected = self._return_result_expected(df, 1000) tm.assert_frame_equal(result, expected, check_column_type=False) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.slow def test_to_csv_chunksize(self): chunksize = 1000 rows = chunksize // 2 + 1 df = DataFrame( np.ones((rows, 2)), - columns=Index(list("ab"), dtype=object), + columns=Index(list("ab")), index=MultiIndex.from_arrays([range(rows) for _ in range(2)]), ) result, expected = self._return_result_expected(df, chunksize, rnlvl=2) tm.assert_frame_equal(result, expected, check_names=False) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.slow @pytest.mark.parametrize( "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] @@ -464,7 +460,7 @@ def test_to_csv_params(self, nrows, df_params, func_params, ncols): for _ in range(df_params["c_idx_nlevels"]) ) else: - columns = Index([f"i-{i}" for i in range(ncols)], dtype=object) + columns = Index([f"i-{i}" for i in range(ncols)]) df = DataFrame(np.ones((nrows, ncols)), index=index, columns=columns) result, expected = self._return_result_expected(df, 1000, **func_params) tm.assert_frame_equal(result, expected, check_names=False) @@ -722,7 +718,6 @@ def test_to_csv_withcommas(self): df2 = self.read_csv(path) tm.assert_frame_equal(df2, df) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_mixed(self): def create_cols(name): return [f"{name}{i:03d}" for i in range(5)] @@ -739,7 +734,7 @@ def create_cols(name): ) df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool")) df_object = DataFrame( - "foo", index=df_float.index, columns=create_cols("object") + "foo", index=df_float.index, columns=create_cols("object"), dtype="object" ) df_dt = DataFrame( Timestamp("20010101").as_unit("ns"), @@ -812,13 +807,12 @@ def test_to_csv_dups_cols(self): result.columns = df.columns tm.assert_frame_equal(result, df) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_dups_cols2(self): # GH3457 df = DataFrame( np.ones((5, 3)), index=Index([f"i-{i}" for i in range(5)], name="foo"), - columns=Index(["a", "a", "b"], dtype=object), + columns=Index(["a", "a", "b"]), ) with tm.ensure_clean() as filename: From fd30531a404e5d796724efbd85ec04f11356ec57 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 12 Jan 2025 
14:46:46 -0800 Subject: [PATCH 357/396] Backport PR #60703 on branch 2.3.x (TST(string dtype): Resolve xfails in stack_unstack) (#60706) Backport PR #60703: TST(string dtype): Resolve xfails in stack_unstack Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/tests/frame/test_stack_unstack.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index af84ee021252f..de470fcda18ed 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.errors import PerformanceWarning @@ -1652,7 +1650,6 @@ def test_unstack_multiple_no_empty_columns(self): expected = unstacked.dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) @@ -1896,7 +1893,6 @@ def test_stack_level_name(self, multiindex_dataframe_random_data, future_stack): expected = frame.stack(future_stack=future_stack) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) From 07c6cad5e9850e798fb6fe019b46281544c5d834 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 12 Jan 2025 14:46:59 -0800 Subject: [PATCH 358/396] Backport PR #60701 on branch 2.3.x (TST(str dtype): Resolve xfail in test_value_counts.py) (#60707) Backport PR #60701: TST(str dtype): Resolve xfail in test_value_counts.py Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/tests/frame/methods/test_value_counts.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 7670b53f23173..4136d641ef67f 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -1,10 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - import pandas as pd import pandas._testing as tm @@ -136,9 +132,6 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture): tm.assert_series_equal(result, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) def test_data_frame_value_counts_dropna_false(nulls_fixture): # GH 41334 df = pd.DataFrame( From 564c7b279afd7075ac338bf45169a381215d58e3 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 12 Jan 2025 14:47:27 -0800 Subject: [PATCH 359/396] Backport PR #60700 on branch 2.3.x (TST(string dtype): Resolve xfail in test_find_replace.py) (#60708) Backport PR #60700: TST(string dtype): Resolve xfail in test_find_replace.py Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/tests/strings/test_find_replace.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 
48159c07de6ab..dfa9a36995480 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -308,23 +308,12 @@ def test_startswith_endswith_validate_na(request, any_string_dtype): dtype=any_string_dtype, ) - dtype = ser.dtype - if (isinstance(dtype, pd.StringDtype)) or dtype == np.dtype("object"): - msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - ser.str.startswith("kapow", na="baz") - msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - ser.str.endswith("bar", na="baz") - else: - # TODO(infer_string): don't surface pyarrow errors - import pyarrow as pa - - msg = "Could not convert 'baz' with type str: tried to convert to boolean" - with pytest.raises(pa.lib.ArrowInvalid, match=msg): - ser.str.startswith("kapow", na="baz") - with pytest.raises(pa.lib.ArrowInvalid, match=msg): - ser.str.endswith("kapow", na="baz") + msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.startswith("kapow", na="baz") + msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.endswith("bar", na="baz") @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") From e90bb0eb1170522a7de1fcec7897d11eaa0719f0 Mon Sep 17 00:00:00 2001 From: Kevin Amparado <109636487+KevsterAmp@users.noreply.github.com> Date: Mon, 13 Jan 2025 16:32:06 +0800 Subject: [PATCH 360/396] DEPR: Raise `FutureWarning` about raising an error in __array__ when copy=False cannot be honored (#60395) Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/arrays/arrow/array.py | 14 +++++++-- pandas/core/arrays/categorical.py | 11 +++++-- pandas/core/arrays/datetimelike.py | 12 ++++++-- pandas/core/arrays/interval.py | 12 ++++++-- pandas/core/arrays/masked.py | 13 ++++++-- pandas/core/arrays/period.py | 11 +++++-- pandas/core/arrays/sparse/array.py | 11 +++++-- pandas/core/generic.py | 13 ++++++-- pandas/core/indexes/multi.py | 11 +++++-- pandas/tests/arrays/sparse/test_array.py | 4 +-- pandas/tests/base/test_conversion.py | 4 +-- pandas/tests/copy_view/test_array.py | 3 +- pandas/tests/extension/base/interface.py | 30 ++++++++++++++----- .../tests/extension/decimal/test_decimal.py | 20 +++++++++++++ pandas/tests/extension/json/array.py | 14 +++++++-- pandas/tests/indexes/multi/test_conversion.py | 3 +- pandas/tests/io/test_fsspec.py | 5 ---- 18 files changed, 151 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 473d67acf6e74..7ccafbb4cf1df 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -34,7 +34,7 @@ Other enhancements - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. 
called when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been
-  updated to work correctly with NumPy >= 2 (:issue:`57739`)
+  updated to raise FutureWarning with NumPy >= 2 (:issue:`60340`)
 - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
 -
 
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 00992ade4f160..2abec9a8024ef 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -12,6 +12,7 @@
     cast,
 )
 import unicodedata
+import warnings
 
 import numpy as np
 
@@ -28,6 +29,7 @@
     pa_version_under13p0,
 )
 from pandas.util._decorators import doc
+from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import validate_fillna_kwargs
 
 from pandas.core.dtypes.cast import (
@@ -663,9 +665,15 @@ def __array__(
     ) -> np.ndarray:
         """Correctly construct numpy arrays when passed to `np.asarray()`."""
         if copy is False:
-            # TODO: By using `zero_copy_only` it may be possible to implement this
-            raise ValueError(
-                "Unable to avoid copy while creating an array as requested."
+            warnings.warn(
+                "Starting with NumPy 2.0, the behavior of the 'copy' keyword has "
+                "changed and passing 'copy=False' raises an error when returning "
+                "a zero-copy NumPy array is not possible. pandas will follow "
+                "this behavior starting with pandas 3.0.\nThis conversion to "
+                "NumPy requires a copy, but 'copy=False' was passed. Consider "
+                "using 'np.asarray(..)' instead.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
             )
         elif copy is None:
             # `to_numpy(copy=False)` has the meaning of NumPy `copy=None`.
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 3383f35bb7d55..0fe69f6d1ebc2 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1672,8 +1672,15 @@ def __array__(
         array(['a', 'b'], dtype=object)
         """
         if copy is False:
-            raise ValueError(
-                "Unable to avoid copy while creating an array as requested."
+            warnings.warn(
+                "Starting with NumPy 2.0, the behavior of the 'copy' keyword has "
+                "changed and passing 'copy=False' raises an error when returning "
+                "a zero-copy NumPy array is not possible. pandas will follow "
+                "this behavior starting with pandas 3.0.\nThis conversion to "
+                "NumPy requires a copy, but 'copy=False' was passed. Consider "
+                "using 'np.asarray(..)' instead.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
             )
 
         ret = take_nd(self.categories._values, self._codes)
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 990116bad13d1..cfe1f3acd9143 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -359,9 +359,17 @@ def __array__(
         # used for Timedelta/DatetimeArray, overwritten by PeriodArray
         if is_object_dtype(dtype):
             if copy is False:
-                raise ValueError(
-                    "Unable to avoid copy while creating an array as requested."
+                warnings.warn(
+                    "Starting with NumPy 2.0, the behavior of the 'copy' keyword has "
+                    "changed and passing 'copy=False' raises an error when returning "
+                    "a zero-copy NumPy array is not possible. pandas will follow this "
+                    "behavior starting with pandas 3.0.\nThis conversion to NumPy "
+                    "requires a copy, but 'copy=False' was passed. Consider using "
+                    "'np.asarray(..)' instead.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
                 )
+
             return np.array(list(self), dtype=object)
 
         if copy is True:
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
index 5aac3d3b28db5..da57e4ceed87e 100644
--- a/pandas/core/arrays/interval.py
+++ b/pandas/core/arrays/interval.py
@@ -42,6 +42,7 @@
 from pandas.compat.numpy import function as nv
 from pandas.errors import IntCastingNaNError
 from pandas.util._decorators import Appender
+from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.cast import (
     LossySetitemError,
@@ -1575,8 +1576,15 @@ def __array__(
         objects (with dtype='object')
         """
         if copy is False:
-            raise ValueError(
-                "Unable to avoid copy while creating an array as requested."
+            warnings.warn(
+                "Starting with NumPy 2.0, the behavior of the 'copy' keyword has "
+                "changed and passing 'copy=False' raises an error when returning "
+                "a zero-copy NumPy array is not possible. pandas will follow "
+                "this behavior starting with pandas 3.0.\nThis conversion to "
+                "NumPy requires a copy, but 'copy=False' was passed. Consider "
+                "using 'np.asarray(..)' instead.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
             )
 
         left = self._left
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 0e839dc7a80bb..da656a2768901 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -38,6 +38,7 @@
 )
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import doc
+from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import validate_fillna_kwargs
 
 from pandas.core.dtypes.base import ExtensionDtype
@@ -604,8 +605,16 @@ def __array__(
         if not self._hasna:
             # special case, here we can simply return the underlying data
             return np.array(self._data, dtype=dtype, copy=copy)
-        raise ValueError(
-            "Unable to avoid copy while creating an array as requested."
+
+        warnings.warn(
+            "Starting with NumPy 2.0, the behavior of the 'copy' keyword has "
+            "changed and passing 'copy=False' raises an error when returning "
+            "a zero-copy NumPy array is not possible. pandas will follow "
+            "this behavior starting with pandas 3.0.\nThis conversion to "
+            "NumPy requires a copy, but 'copy=False' was passed. Consider "
+            "using 'np.asarray(..)' instead.",
+            FutureWarning,
+            stacklevel=find_stack_level(),
         )
 
         if copy is None:
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index aad7737b8dd94..2947ba7b8c72a 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -415,8 +415,15 @@ def __array__(
             return np.array(self.asi8, dtype=dtype)
 
         if copy is False:
-            raise ValueError(
-                "Unable to avoid copy while creating an array as requested."
+            warnings.warn(
+                "Starting with NumPy 2.0, the behavior of the 'copy' keyword has "
+                "changed and passing 'copy=False' raises an error when returning "
+                "a zero-copy NumPy array is not possible. pandas will follow "
+                "this behavior starting with pandas 3.0.\nThis conversion to "
+                "NumPy requires a copy, but 'copy=False' was passed. Consider "
+                "using 'np.asarray(..)' instead.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
             )
 
         if dtype == bool:
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index 13577e366d54b..07ff592f491a8 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -562,8 +562,15 @@ def __array__(
             return self.sp_values
 
         if copy is False:
-            raise ValueError(
-                "Unable to avoid copy while creating an array as requested."
+            warnings.warn(
+                "Starting with NumPy 2.0, the behavior of the 'copy' keyword has "
+                "changed and passing 'copy=False' raises an error when returning "
+                "a zero-copy NumPy array is not possible. pandas will follow "
+                "this behavior starting with pandas 3.0.\nThis conversion to "
+                "NumPy requires a copy, but 'copy=False' was passed. Consider "
+                "using 'np.asarray(..)' instead.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
             )
 
         fill_value = self.fill_value
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index e55a54112ee72..70b72577dd5d1 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2151,9 +2151,16 @@ def __array__(
     ) -> np.ndarray:
         if copy is False and not self._mgr.is_single_block and not self.empty:
             # check this manually, otherwise ._values will already return a copy
-            # and np.array(values, copy=False) will not raise an error
-            raise ValueError(
-                "Unable to avoid copy while creating an array as requested."
+            # and np.array(values, copy=False) will not raise a warning
+            warnings.warn(
+                "Starting with NumPy 2.0, the behavior of the 'copy' keyword has "
+                "changed and passing 'copy=False' raises an error when returning "
+                "a zero-copy NumPy array is not possible. pandas will follow "
+                "this behavior starting with pandas 3.0.\nThis conversion to "
+                "NumPy requires a copy, but 'copy=False' was passed. Consider "
+                "using 'np.asarray(..)' instead.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
             )
         values = self._values
         if copy is None:
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 7cb28214c7289..8954d49649a2b 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -1314,8 +1314,15 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray:
         """the array interface, return my values"""
         if copy is False:
             # self.values is always a newly construct array, so raise.
-            raise ValueError(
-                "Unable to avoid copy while creating an array as requested."
+            warnings.warn(
+                "Starting with NumPy 2.0, the behavior of the 'copy' keyword has "
+                "changed and passing 'copy=False' raises an error when returning "
+                "a zero-copy NumPy array is not possible. pandas will follow "
+                "this behavior starting with pandas 3.0.\nThis conversion to "
+                "NumPy requires a copy, but 'copy=False' was passed. Consider "
+                "using 'np.asarray(..)' instead.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
             )
         if copy is True:
             # explicit np.array call to ensure a copy is made and unique objects
diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
index 1e8d36b184e48..b2a570b14df3c 100644
--- a/pandas/tests/arrays/sparse/test_array.py
+++ b/pandas/tests/arrays/sparse/test_array.py
@@ -500,8 +500,8 @@ def test_array_interface(arr_data, arr):
         # copy=False semantics are only supported in NumPy>=2.
         return
 
-    # for sparse arrays, copy=False is never allowed
-    with pytest.raises(ValueError, match="Unable to avoid copy while creating"):
+    msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
         np.array(arr, copy=False)
 
     # except when there are actually no sparse filled values
diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
index a65ab4d287d11..4d0e2d1ce0e07 100644
--- a/pandas/tests/base/test_conversion.py
+++ b/pandas/tests/base/test_conversion.py
@@ -378,8 +378,8 @@ def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array):
         return
 
     if not zero_copy:
-        with pytest.raises(ValueError, match="Unable to avoid copy while creating"):
-            # An error is always acceptable for `copy=False`
+        msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
             np.array(thing, copy=False)
 
     else:
diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py
index 06d9424450011..0dabec6014b0d 100644
--- a/pandas/tests/copy_view/test_array.py
+++ b/pandas/tests/copy_view/test_array.py
@@ -187,7 +187,8 @@ def test_dataframe_multiple_numpy_dtypes():
 
     if np_version_gt2:
         # copy=False semantics are only supported in NumPy>=2.
-        with pytest.raises(ValueError, match="Unable to avoid copy while creating"):
+        msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed"
+        with pytest.raises(FutureWarning, match=msg):
             arr = np.array(df, copy=False)
 
     arr = np.array(df, copy=True)
diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py
index 79eb64b5a654f..38cece7da3308 100644
--- a/pandas/tests/extension/base/interface.py
+++ b/pandas/tests/extension/base/interface.py
@@ -1,3 +1,5 @@
+import warnings
+
 import numpy as np
 import pytest
 
@@ -82,15 +84,27 @@ def test_array_interface_copy(self, data):
         # copy=False semantics are only supported in NumPy>=2.
return - try: + warning_raised = False + msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed" + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") result_nocopy1 = np.array(data, copy=False) - except ValueError: - # An error is always acceptable for `copy=False` - return - - result_nocopy2 = np.array(data, copy=False) - # If copy=False was given and did not raise, these must share the same data - assert np.may_share_memory(result_nocopy1, result_nocopy2) + assert len(w) <= 1 + if len(w): + warning_raised = True + assert msg in str(w[0].message) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result_nocopy2 = np.array(data, copy=False) + assert len(w) <= 1 + if len(w): + warning_raised = True + assert msg in str(w[0].message) + + if not warning_raised: + # If copy=False was given and did not raise, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) def test_is_extension_array_dtype(self, data): assert is_extension_array_dtype(data) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 8afb989508e04..8590cd7fdc235 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + import pandas as pd import pandas._testing as tm from pandas.tests.extension import base @@ -289,6 +291,24 @@ def test_series_repr(self, data): def test_unary_ufunc_dunder_equivalence(self, data, ufunc): super().test_unary_ufunc_dunder_equivalence(data, ufunc) + def test_array_interface_copy(self, data): + result_copy1 = np.array(data, copy=True) + result_copy2 = np.array(data, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + try: + result_nocopy1 = np.array(data, copy=False) + except ValueError: + # An error is always acceptable for `copy=False` + return + + result_nocopy2 = np.array(data, copy=False) + # If copy=False was given and did not raise, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + def test_take_na_value_other_decimal(): arr = DecimalArray([decimal.Decimal("1.0"), decimal.Decimal("2.0")]) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index b6d72c10712f2..5ff99589a1961 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -25,9 +25,12 @@ TYPE_CHECKING, Any, ) +import warnings import numpy as np +from pandas.util._exceptions import find_stack_level + from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( is_bool_dtype, @@ -148,8 +151,15 @@ def __ne__(self, other): def __array__(self, dtype=None, copy=None): if copy is False: - raise ValueError( - "Unable to avoid copy while creating an array as requested." + warnings.warn( + "Starting with NumPy 2.0, the behavior of the 'copy' keyword has " + "changed and passing 'copy=False' raises an error when returning " + "a zero-copy NumPy array is not possible. pandas will follow " + "this behavior starting with pandas 3.0.\nThis conversion to " + "NumPy requires a copy, but 'copy=False' was passed. 
Consider " + "using 'np.asarray(..)' instead.", + FutureWarning, + stacklevel=find_stack_level(), ) if dtype is None: diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 58a2dc00f937d..d62bd5438a1e3 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -47,7 +47,8 @@ def test_array_interface(idx): return # for MultiIndex, copy=False is never allowed - with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed" + with tm.assert_produces_warning(FutureWarning, match=msg): np.array(idx, copy=False) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index cf59e3e4c4934..e0d652facb8fc 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -5,8 +5,6 @@ from pandas._config import using_string_dtype -from pandas.compat import HAS_PYARROW - from pandas import ( DataFrame, date_range, @@ -170,9 +168,6 @@ def test_excel_options(fsspectest): assert fsspectest.test[0] == "read" -@pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string) fastparquet" -) def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") From 7374d09bf49ba459e0e4f666e7f3026452aa744e Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Tue, 14 Jan 2025 13:04:04 -0500 Subject: [PATCH 361/396] [backport 2.3.x] TST(string dtype): Resolve xfail when grouping by nan column (#60712) (#60719) TST(string dtype): Resolve xfail when grouping by nan column (#60712) (cherry picked from commit 55a6d0a613897040fec1ae11adc15f5f04728032) --- pandas/tests/groupby/test_groupby.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b5588898d4580..07ddbc36b5ab0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( PerformanceWarning, SpecificationError, @@ -2876,12 +2874,13 @@ def test_groupby_none_in_first_mi_level(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_groupby_none_column_name(): +def test_groupby_none_column_name(using_infer_string): # GH#47348 df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]}) - result = df.groupby(by=[None]).sum() - expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=None)) + by = [np.nan] if using_infer_string else [None] + gb = df.groupby(by=by) + result = gb.sum() + expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=by[0])) tm.assert_frame_equal(result, expected) From fcc94eb0aca7cb0c90c992d1c36220bde5580000 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 14 Jan 2025 21:39:51 +0100 Subject: [PATCH 362/396] [backport 2.3.x] String dtype: disallow specifying the 'str' dtype with storage in [..] 
in string alias (#60661) (#60715) (cherry picked from commit 7415aca37159a99f8f99d93a1908070ddf36178c) --- pandas/core/dtypes/dtypes.py | 2 +- pandas/tests/dtypes/test_common.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index e7efb8598ec61..542bc85110cad 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2242,7 +2242,7 @@ def construct_from_string(cls, string: str) -> ArrowDtype: ) if not string.endswith("[pyarrow]"): raise TypeError(f"'{string}' must end with '[pyarrow]'") - if string == "string[pyarrow]": + if string in ("string[pyarrow]", "str[pyarrow]"): # Ensure Registry.find skips ArrowDtype to use StringDtype instead raise TypeError("string[pyarrow] should be constructed by StringDtype") diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index f7442cf5d6d3c..ceebfb1920594 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -835,3 +835,23 @@ def test_pandas_dtype_string_dtypes(string_storage): with pd.option_context("string_storage", string_storage): result = pandas_dtype("string") assert result == pd.StringDtype(string_storage, na_value=pd.NA) + + +def test_pandas_dtype_string_dtype_alias_with_storage(): + with pytest.raises(TypeError, match="not understood"): + pandas_dtype("str[python]") + + with pytest.raises(TypeError, match="not understood"): + pandas_dtype("str[pyarrow]") + + result = pandas_dtype("string[python]") + assert result == pd.StringDtype("python", na_value=pd.NA) + + if HAS_PYARROW: + result = pandas_dtype("string[pyarrow]") + assert result == pd.StringDtype("pyarrow", na_value=pd.NA) + else: + with pytest.raises( + ImportError, match="required for PyArrow backed StringArray" + ): + pandas_dtype("string[pyarrow]") From 36d34a1506a4aa5ea01d8b3eb5f5dde526346824 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Tue, 21 Jan 2025 12:20:48 -0500 Subject: [PATCH 363/396] Backport PR #60713: TST(string dtype): Resolve xfail in test_base.py (#60742) * Backport PR #60615: TST(string dtype): Resolve some HDF5 xfails * Backport PR #60713: TST(string dtype): Resolve xfail in test_base.py --- pandas/core/arrays/string_.py | 5 +++++ pandas/tests/indexes/test_base.py | 9 +++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e163a9df8ee10..3efb48c86e92c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -531,6 +531,11 @@ def _str_map_nan_semantics( else: return self._str_map_str_or_object(dtype, na_value, arr, f, mask) + def view(self, dtype: Dtype | None = None) -> ArrayLike: + if dtype is not None: + raise TypeError("Cannot change data-type for string array.") + return super().view(dtype=dtype) + # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index e3b8a60354b61..a94e4728a9751 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -354,14 +354,11 @@ def test_view_with_args_object_array_raises(self, index): msg = "When changing to a larger dtype" with pytest.raises(ValueError, match=msg): index.view("i8") - elif index.dtype == "str" and not index.dtype.storage == "python": - # TODO(infer_string): Make the 
errors consistent - with pytest.raises(NotImplementedError, match="i8"): - index.view("i8") else: msg = ( - "Cannot change data-type for array of references|" - "Cannot change data-type for object array|" + r"Cannot change data-type for array of references\.|" + r"Cannot change data-type for object array\.|" + r"Cannot change data-type for array of strings\.|" ) with pytest.raises(TypeError, match=msg): index.view("i8") From c638e69db1b31b5387647986d5886fff2066562a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 22 Jan 2025 15:30:16 +0100 Subject: [PATCH 364/396] [backport 2.3.x] Update PyArrow conversion and arrow/parquet tests for pyarrow 19.0 (#60716) (#60755) Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> (cherry picked from commit 5efac8250787414ec580f0472e2b563032ec7d53) * fixup * don't hardcode object dtype * also enable CoW when enabling future.infer_string --- .github/workflows/unit-tests.yml | 2 + ci/deps/actions-311-pyarrownightly.yaml | 2 +- pandas/compat/__init__.py | 2 + pandas/compat/pyarrow.py | 4 +- pandas/io/_util.py | 10 +++- pandas/tests/arrays/string_/test_string.py | 22 ++++++- pandas/tests/io/test_common.py | 5 +- pandas/tests/io/test_feather.py | 26 ++++++-- pandas/tests/io/test_parquet.py | 70 ++++++++++++++-------- 9 files changed, 103 insertions(+), 40 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 4a2c412d2d98e..210852d0cd809 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -105,6 +105,8 @@ jobs: - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" + pandas_future_infer_string: "1" + pandas_copy_on_write: "1" fail-fast: false name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} env: diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index ba655f9690af6..40b936472d409 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -25,7 +25,7 @@ dependencies: - pip: - "tzdata>=2022.7" - - "--extra-index-url https://pypi.fury.io/arrow-nightlies/" + - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - "--prefer-binary" - "--pre" - "pyarrow" diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 5e82853109015..9b6b1ab3b8909 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -34,6 +34,7 @@ pa_version_under16p0, pa_version_under17p0, pa_version_under18p0, + pa_version_under19p0, ) if TYPE_CHECKING: @@ -193,6 +194,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under16p0", "pa_version_under17p0", "pa_version_under18p0", + "pa_version_under19p0", "HAS_PYARROW", "IS64", "ISMUSL", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index f579b8a45d386..81a2d0dc80a10 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -18,6 +18,7 @@ pa_version_under16p0 = _palv < Version("16.0.0") pa_version_under17p0 = _palv < Version("17.0.0") pa_version_under18p0 = _palv < Version("18.0.0") + pa_version_under19p0 = _palv < Version("19.0.0") HAS_PYARROW = True except ImportError: pa_version_under10p1 = True @@ -29,5 +30,6 @@ pa_version_under15p0 = True pa_version_under16p0 = True pa_version_under17p0 = True - pa_version_under18p0 = False + pa_version_under18p0 = True + pa_version_under19p0 = True 
 HAS_PYARROW = False
diff --git a/pandas/io/_util.py b/pandas/io/_util.py
index 9373888e28d28..35fdfb1a9ee82 100644
--- a/pandas/io/_util.py
+++ b/pandas/io/_util.py
@@ -10,7 +10,10 @@
 from pandas._config import using_string_dtype
 
 from pandas._libs import lib
-from pandas.compat import pa_version_under18p0
+from pandas.compat import (
+    pa_version_under18p0,
+    pa_version_under19p0,
+)
 from pandas.compat._optional import import_optional_dependency
 
 import pandas as pd
@@ -78,7 +81,10 @@ def arrow_table_to_pandas(
     elif dtype_backend == "pyarrow":
         types_mapper = pd.ArrowDtype
     elif using_string_dtype():
-        types_mapper = _arrow_string_types_mapper()
+        if pa_version_under19p0:
+            types_mapper = _arrow_string_types_mapper()
+        else:
+            types_mapper = None
     elif dtype_backend is lib.no_default or dtype_backend == "numpy":
         types_mapper = None
     else:
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 14c02723191a8..c7f854c11f3dd 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -9,7 +9,10 @@
 
 from pandas._config import using_string_dtype
 
-from pandas.compat.pyarrow import pa_version_under12p0
+from pandas.compat.pyarrow import (
+    pa_version_under12p0,
+    pa_version_under19p0,
+)
 
 from pandas.core.dtypes.common import is_dtype_equal
 
@@ -541,7 +544,7 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
     assert table.field("a").type == "large_string"
     with pd.option_context("string_storage", string_storage):
         result = table.to_pandas()
-    if dtype.na_value is np.nan and not using_string_dtype():
+    if dtype.na_value is np.nan and not using_infer_string:
         assert result["a"].dtype == "object"
     else:
         assert isinstance(result["a"].dtype, pd.StringDtype)
@@ -555,6 +558,21 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
     assert result.loc[2, "a"] is result["a"].dtype.na_value
 
 
+@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
+def test_arrow_from_string(using_infer_string):
+    # not roundtrip, but starting with pyarrow table without pandas metadata
+    pa = pytest.importorskip("pyarrow")
+    table = pa.table({"a": pa.array(["a", "b", None], type=pa.string())})
+
+    result = table.to_pandas()
+
+    if using_infer_string and not pa_version_under19p0:
+        expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str")
+    else:
+        expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object")
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
 def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
     # GH-41040
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index a0dd64f1cb82b..a815ba9c1650a 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -19,6 +19,7 @@
 import pytest
 
 from pandas.compat import is_platform_windows
+from pandas.compat.pyarrow import pa_version_under19p0
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -166,8 +167,8 @@ def test_get_handle_pyarrow_compat(self):
         s = StringIO(data)
         with icom.get_handle(s, "rb", is_text=False) as handles:
             df = pa_csv.read_csv(handles.handle).to_pandas()
-            # TODO will have to update this when pyarrow' to_pandas() is fixed
-            expected = expected.astype("object")
+            if pa_version_under19p0:
+                expected = expected.astype("object")
             tm.assert_frame_equal(df, expected)
 
         assert not s.closed
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
index 58a5f78ce3258..0ab23e3b51a03 100644
--- a/pandas/tests/io/test_feather.py
+++ b/pandas/tests/io/test_feather.py
@@ -2,7 +2,10 @@
 import numpy as np
 import pytest
 
-from pandas.compat.pyarrow import pa_version_under18p0
+from pandas.compat.pyarrow import (
+    pa_version_under18p0,
+    pa_version_under19p0,
+)
 
 import pandas as pd
 import pandas._testing as tm
@@ -133,8 +136,8 @@ def test_rw_use_threads(self):
     def test_path_pathlib(self):
         df = pd.DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=pd.Index(list("ABCD"), dtype=object),
-            index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
         ).reset_index()
         result = tm.round_trip_pathlib(df.to_feather, read_feather)
         tm.assert_frame_equal(df, result)
 
     def test_path_localpath(self):
         df = pd.DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=pd.Index(list("ABCD"), dtype=object),
-            index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
         ).reset_index()
         result = tm.round_trip_localpath(df.to_feather, read_feather)
         tm.assert_frame_equal(df, result)
@@ -241,16 +244,27 @@ def test_invalid_dtype_backend(self):
         with pytest.raises(ValueError, match=msg):
             read_feather(path, dtype_backend="numpy")
 
-    def test_string_inference(self, tmp_path):
+    def test_string_inference(self, tmp_path, using_infer_string):
         # GH#54431
         path = tmp_path / "test_string_inference.p"
         df = pd.DataFrame(data={"a": ["x", "y"]})
         df.to_feather(path)
         with pd.option_context("future.infer_string", True):
             result = read_feather(path)
+        dtype = pd.StringDtype(na_value=np.nan)
         expected = pd.DataFrame(
             data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan)
         )
+        expected = pd.DataFrame(
+            data={"a": ["x", "y"]},
+            dtype=dtype,
+            columns=pd.Index(
+                ["a"],
+                dtype=object
+                if pa_version_under19p0 and not using_infer_string
+                else dtype,
+            ),
+        )
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0")
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 87f9b0108402c..f66ee7dc4367e 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -19,6 +19,7 @@
     pa_version_under11p0,
     pa_version_under13p0,
     pa_version_under15p0,
+    pa_version_under19p0,
 )
 
 import pandas as pd
@@ -110,10 +111,7 @@ def fp(request):
 
 @pytest.fixture
 def df_compat():
-    # TODO(infer_string) should this give str columns?
- return pd.DataFrame( - {"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"], dtype=object) - ) + return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"])) @pytest.fixture @@ -261,8 +259,10 @@ def test_invalid_engine(df_compat): check_round_trip(df_compat, "foo", "bar") -def test_options_py(df_compat, pa): +def test_options_py(df_compat, pa, using_infer_string): # use the set option + if using_infer_string and not pa_version_under19p0: + df_compat.columns = df_compat.columns.astype("str") with pd.option_context("io.parquet.engine", "pyarrow"): check_round_trip(df_compat) @@ -798,18 +798,21 @@ def test_unsupported_float16_cleanup(self, pa, path_type): def test_categorical(self, pa): # supported in >= 0.7.0 - df = pd.DataFrame() - df["a"] = pd.Categorical(list("abcdef")) - - # test for null, out-of-order values, and unobserved category - df["b"] = pd.Categorical( - ["bar", "foo", "foo", "bar", None, "bar"], - dtype=pd.CategoricalDtype(["foo", "bar", "baz"]), - ) - - # test for ordered flag - df["c"] = pd.Categorical( - ["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True + df = pd.DataFrame( + { + "a": pd.Categorical(list("abcdef")), + # test for null, out-of-order values, and unobserved category + "b": pd.Categorical( + ["bar", "foo", "foo", "bar", None, "bar"], + dtype=pd.CategoricalDtype(["foo", "bar", "baz"]), + ), + # test for ordered flag + "c": pd.Categorical( + ["a", "b", "c", "a", "c", "b"], + categories=["b", "c", "d"], + ordered=True, + ), + } ) check_round_trip(df, pa) @@ -878,11 +881,13 @@ def test_s3_roundtrip_for_dir( repeat=1, ) - def test_read_file_like_obj_support(self, df_compat): + def test_read_file_like_obj_support(self, df_compat, using_infer_string): pytest.importorskip("pyarrow") buffer = BytesIO() df_compat.to_parquet(buffer) df_from_buf = read_parquet(buffer) + if using_infer_string and not pa_version_under19p0: + df_compat.columns = df_compat.columns.astype("str") tm.assert_frame_equal(df_compat, df_from_buf) def test_expand_user(self, df_compat, monkeypatch): @@ -949,7 +954,7 @@ def test_additional_extension_arrays(self, pa, using_infer_string): "c": pd.Series(["a", None, "c"], dtype="string"), } ) - if using_infer_string: + if using_infer_string and pa_version_under19p0: check_round_trip(df, pa, expected=df.astype({"c": "str"})) else: check_round_trip(df, pa) @@ -963,7 +968,10 @@ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_strin df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) with pd.option_context("string_storage", string_storage): if using_infer_string: - expected = df.astype("str") + if pa_version_under19p0: + expected = df.astype("str") + else: + expected = df.astype(f"string[{string_storage}]") expected.columns = expected.columns.astype("str") else: expected = df.astype(f"string[{string_storage}]") @@ -1128,17 +1136,24 @@ def test_df_attrs_persistence(self, tmp_path, pa): new_df = read_parquet(path, engine=pa) assert new_df.attrs == df.attrs - def test_string_inference(self, tmp_path, pa): + def test_string_inference(self, tmp_path, pa, using_infer_string): # GH#54431 path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"]) - df.to_parquet(path, engine="pyarrow") + df.to_parquet(path, engine=pa) with pd.option_context("future.infer_string", True): - result = read_parquet(path, engine="pyarrow") + result = read_parquet(path, engine=pa) + dtype = pd.StringDtype(na_value=np.nan) expected = pd.DataFrame( 
data={"a": ["x", "y"]}, - dtype=pd.StringDtype(na_value=np.nan), - index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + dtype=dtype, + index=pd.Index(["a", "b"], dtype=dtype), + columns=pd.Index( + ["a"], + dtype=object + if pa_version_under19p0 and not using_infer_string + else dtype, + ), ) tm.assert_frame_equal(result, expected) @@ -1151,7 +1166,10 @@ def test_roundtrip_decimal(self, tmp_path, pa): df = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="string[pyarrow]") df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))])) result = read_parquet(path) - expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") + if pa_version_under19p0: + expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") + else: + expected = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="object") tm.assert_frame_equal(result, expected) def test_infer_string_large_string_type(self, tmp_path, pa): From 1c0c35110f49ffe408148852c3318556ebc04c0a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 22 Jan 2025 17:27:47 +0100 Subject: [PATCH 365/396] [backport 2.3.x] ENH: Implement cum* methods for PyArrow strings (#60633) (#60753) (cherry picked from commit b5d4e89d378e69a87b1b9ac7f3d6fa6867840fff) Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/conftest.py | 16 +++++++ pandas/core/arrays/arrow/array.py | 55 +++++++++++++++++++++++ pandas/tests/apply/test_str.py | 10 ++++- pandas/tests/extension/base/accumulate.py | 5 ++- pandas/tests/extension/test_arrow.py | 15 ++++--- pandas/tests/extension/test_string.py | 10 +++++ pandas/tests/series/test_cumulative.py | 54 ++++++++++++++++++++++ 8 files changed, 157 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 7ccafbb4cf1df..13f653721a9e9 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -35,8 +35,8 @@ Other enhancements - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been updated to raise FutureWarning with NumPy >= 2 (:issue:`60340`) +- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) -- .. --------------------------------------------------------------------------- .. _whatsnew_230.notable_bug_fixes: diff --git a/pandas/conftest.py b/pandas/conftest.py index 1567708d04b20..35fe5cb475cde 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1273,6 +1273,22 @@ def nullable_string_dtype(request): return request.param +@pytest.fixture( + params=[ + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + ] +) +def pyarrow_string_dtype(request): + """ + Parametrized fixture for string dtypes backed by Pyarrow. 
+ + * 'str[pyarrow]' + * 'string[pyarrow]' + """ + return pd.StringDtype(*request.param) + + @pytest.fixture( params=[ "python", diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2abec9a8024ef..010a0cb608de1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -45,6 +45,7 @@ is_list_like, is_numeric_dtype, is_scalar, + is_string_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna @@ -1617,6 +1618,9 @@ def _accumulate( ------ NotImplementedError : subclass does not define accumulations """ + if is_string_dtype(self): + return self._str_accumulate(name=name, skipna=skipna, **kwargs) + pyarrow_name = { "cummax": "cumulative_max", "cummin": "cumulative_min", @@ -1652,6 +1656,57 @@ def _accumulate( return type(self)(result) + def _str_accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> ArrowExtensionArray | ExtensionArray: + """ + Accumulate implementation for strings, see `_accumulate` docstring for details. + + pyarrow.compute does not implement these methods for strings. + """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + # We may need to strip out trailing NA values + tail: pa.array | None = None + na_mask: pa.array | None = None + pa_array = self._pa_array + np_func = { + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + }[name] + + if self._hasna: + na_mask = pc.is_null(pa_array) + if pc.all(na_mask) == pa.scalar(True): + return type(self)(pa_array) + if skipna: + if name == "cumsum": + pa_array = pc.fill_null(pa_array, "") + else: + # We can retain the running min/max by forward/backward filling. + pa_array = pc.fill_null_forward(pa_array) + pa_array = pc.fill_null_backward(pa_array) + else: + # When not skipping NA values, the result should be null from + # the first NA value onward. + idx = pc.index(na_mask, True).as_py() + tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) + pa_array = pa_array[:idx] + + # error: Cannot call function of unknown type + pa_result = pa.array(np_func(pa_array), type=pa_array.type) # type: ignore[operator] + + if tail is not None: + pa_result = pa.concat_arrays([pa_result, tail]) + elif na_mask is not None: + pa_result = pc.if_else(na_mask, None, pa_result) + + result = type(self)(pa_result) + return result + def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar: """ Return a pyarrow scalar result of performing the reduction operation. 
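The new _str_accumulate path above is easiest to read from the user side. Below is a minimal sketch of the resulting behavior, assuming pandas is built from this branch with PyArrow installed; the expected values mirror the parametrized cases added to pandas/tests/series/test_cumulative.py further down in this patch.

    import pandas as pd

    ser = pd.Series(["x", pd.NA, "y"], dtype="string[pyarrow]")

    # skipna=True: NA positions stay NA, the running concatenation skips them
    print(ser.cumsum(skipna=True).tolist())   # ['x', <NA>, 'xy']

    # skipna=False: everything from the first NA onward becomes NA
    print(ser.cumsum(skipna=False).tolist())  # ['x', <NA>, <NA>]

    # cummin/cummax track the running lexicographic extremes
    print(pd.Series(["y", "z", "x"], dtype="string[pyarrow]").cummin().tolist())
    # ['y', 'y', 'x']

    # cumprod has no meaning for strings and raises, per the TypeError branch above
    try:
        pd.Series(["x", "y"], dtype="string[pyarrow]").cumprod()
    except TypeError as err:
        print(err)  # e.g. operation 'cumprod' not supported for dtype 'string'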
diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index f916567c6b883..9c7836a0aa167 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW + from pandas.core.dtypes.common import is_number from pandas import ( @@ -170,10 +172,14 @@ def test_agg_cython_table_transform_series(request, series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - if series.dtype == "string" and func in ("cumsum", np.cumsum, np.nancumsum): + if ( + series.dtype == "string" + and func in ("cumsum", np.cumsum, np.nancumsum) + and not HAS_PYARROW + ): request.applymarker( pytest.mark.xfail( - raises=(TypeError, NotImplementedError), + raises=NotImplementedError, reason="TODO(infer_string) cumsum not yet implemented for string", ) ) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 9a41a3a582c4a..9a2f186c2a00b 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -18,8 +18,9 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): try: alt = ser.astype("float64") - except TypeError: - # e.g. Period can't be cast to float64 + except (TypeError, ValueError): + # e.g. Period can't be cast to float64 (TypeError) + # String can't be cast to float64 (ValueError) alt = ser.astype(object) result = getattr(ser, op_name)(skipna=skipna) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d524ed5a16828..a63cde8022e24 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -388,13 +388,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: # attribute "pyarrow_dtype" pa_type = ser.dtype.pyarrow_dtype # type: ignore[union-attr] - if ( - pa.types.is_string(pa_type) - or pa.types.is_binary(pa_type) - or pa.types.is_decimal(pa_type) - ): + if pa.types.is_binary(pa_type) or pa.types.is_decimal(pa_type): if op_name in ["cumsum", "cumprod", "cummax", "cummin"]: return False + elif pa.types.is_string(pa_type): + if op_name == "cumprod": + return False elif pa.types.is_boolean(pa_type): if op_name in ["cumprod", "cummax", "cummin"]: return False @@ -409,6 +408,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request): pa_type = data.dtype.pyarrow_dtype op_name = all_numeric_accumulations + + if pa.types.is_string(pa_type) and op_name in ["cumsum", "cummin", "cummax"]: + # https://github.com/pandas-dev/pandas/pull/60633 + # Doesn't fit test structure, tested in series/test_cumulative.py instead. 
+ return + ser = pd.Series(data) if not self._supports_accumulation(ser, op_name): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 7997bca5c1c9b..301c7ee851aa0 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -23,6 +23,8 @@ from pandas.compat import HAS_PYARROW +from pandas.core.dtypes.base import StorageExtensionDtype + import pandas as pd import pandas._testing as tm from pandas.api.types import is_string_dtype @@ -196,6 +198,14 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: and op_name in ("any", "all") ) + def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: + assert isinstance(ser.dtype, StorageExtensionDtype) + return ser.dtype.storage == "pyarrow" and op_name in [ + "cummin", + "cummax", + "cumsum", + ] + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index e6f7b2a5e69e0..0dc391db2182b 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -6,6 +6,8 @@ tests.frame.test_cumulative """ +import re + import numpy as np import pytest @@ -155,3 +157,55 @@ def test_cumprod_timedelta(self): ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)]) with pytest.raises(TypeError, match="cumprod not supported for Timedelta"): ser.cumprod() + + @pytest.mark.parametrize( + "data, op, skipna, expected_data", + [ + ([], "cumsum", True, []), + ([], "cumsum", False, []), + (["x", "z", "y"], "cumsum", True, ["x", "xz", "xzy"]), + (["x", "z", "y"], "cumsum", False, ["x", "xz", "xzy"]), + (["x", pd.NA, "y"], "cumsum", True, ["x", pd.NA, "xy"]), + (["x", pd.NA, "y"], "cumsum", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cumsum", True, [pd.NA, "x", "xy"]), + ([pd.NA, "x", "y"], "cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummin", True, []), + ([], "cummin", False, []), + (["y", "z", "x"], "cummin", True, ["y", "y", "x"]), + (["y", "z", "x"], "cummin", False, ["y", "y", "x"]), + (["y", pd.NA, "x"], "cummin", True, ["y", pd.NA, "x"]), + (["y", pd.NA, "x"], "cummin", False, ["y", pd.NA, pd.NA]), + ([pd.NA, "y", "x"], "cummin", True, [pd.NA, "y", "x"]), + ([pd.NA, "y", "x"], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummax", True, []), + ([], "cummax", False, []), + (["x", "z", "y"], "cummax", True, ["x", "z", "z"]), + (["x", "z", "y"], "cummax", False, ["x", "z", "z"]), + (["x", pd.NA, "y"], "cummax", True, ["x", pd.NA, "y"]), + (["x", pd.NA, "y"], "cummax", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cummax", True, [pd.NA, "x", "y"]), + ([pd.NA, "x", "y"], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", True, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ], + ) + def test_cum_methods_pyarrow_strings( + self, pyarrow_string_dtype, data, op, skipna, expected_data + ): + # https://github.com/pandas-dev/pandas/pull/60633 + ser = pd.Series(data, dtype=pyarrow_string_dtype) + method = getattr(ser, op) + expected = 
pd.Series(expected_data, dtype=pyarrow_string_dtype) + result = method(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna): + # https://github.com/pandas-dev/pandas/pull/60633 + ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype) + msg = re.escape(f"operation 'cumprod' not supported for dtype '{ser.dtype}'") + with pytest.raises(TypeError, match=msg): + ser.cumprod(skipna=skipna) From a23cf0df2c30c3bba5e31956f11010a32949f8cc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 22 Jan 2025 17:28:17 +0100 Subject: [PATCH 366/396] [backport 2.3.x] ENH: Enable .mode to sort with NA values (#60702) (#60754) (cherry picked from commit 1708e9020c418e91fae430cf6a7a6ec09c466429) Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/core/algorithms.py | 2 +- pandas/tests/frame/test_reductions.py | 17 ++--------------- pandas/tests/reductions/test_reductions.py | 13 +++---------- 3 files changed, 6 insertions(+), 26 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 085a4ee41dcc9..c6084880bea5d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1053,7 +1053,7 @@ def mode( return npresult, res_mask # type: ignore[return-value] try: - npresult = np.sort(npresult) + npresult = safe_sort(npresult) except TypeError as err: warnings.warn( f"Unable to sort modes: {err}", diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 84d56864b3219..1b2e55c978071 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -674,23 +674,10 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - def test_mode_sortwarning(self, using_infer_string): - # Check for the warning that is raised when the mode - # results cannot be sorted - + def test_mode_sort_with_na(self, using_infer_string): df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) expected = DataFrame({"A": ["a", np.nan]}) - - # TODO(infer_string) avoid this UserWarning for python storage - warning = ( - None - if using_infer_string and df.A.dtype.storage == "pyarrow" - else UserWarning - ) - with tm.assert_produces_warning(warning, match="Unable to sort modes"): - result = df.mode(dropna=False) - result = result.sort_values(by="A").reset_index(drop=True) - + result = df.mode(dropna=False) tm.assert_frame_equal(result, expected) def test_mode_empty_df(self): diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 496b00a0547b7..7ca1239286188 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1661,17 +1661,10 @@ def test_mode_intoverflow(self, dropna, expected1, expected2): expected2 = Series(expected2, dtype=np.uint64) tm.assert_series_equal(result, expected2) - def test_mode_sortwarning(self): - # Check for the warning that is raised when the mode - # results cannot be sorted - - expected = Series(["foo", np.nan], dtype=object) + def test_mode_sort_with_na(self): s = Series([1, "foo", "foo", np.nan, np.nan]) - - with tm.assert_produces_warning(UserWarning): - result = s.mode(dropna=False) - result = result.sort_values().reset_index(drop=True) - + expected = Series(["foo", np.nan], dtype=object) + result = s.mode(dropna=False) tm.assert_series_equal(result, expected) def test_mode_boolean_with_na(self): 
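Patch 366 above is small but user-visible: mode results are now ordered with safe_sort instead of np.sort, and safe_sort knows how to order mixed-type values and place NA values last. A short sketch of the effect, assuming pandas is built from this branch; the expected output matches the reworked test_mode_sort_with_na tests above.

    import numpy as np
    import pandas as pd

    s = pd.Series([1, "foo", "foo", np.nan, np.nan])

    # "foo" and NaN are tied for most frequent. With dropna=False the result
    # now comes back already sorted, NA last, and without the old
    # "Unable to sort modes" UserWarning that np.sort triggered on mixed types.
    print(s.mode(dropna=False))
    # 0    foo
    # 1    NaN
    # dtype: object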
From 00e0603bb7ebfe407017c676463b2eebd3003146 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 22 Jan 2025 10:34:59 -0800 Subject: [PATCH 367/396] Backport PR #60718 on branch 2.3.x (DOC: Whatsnew for sorting mode result) (#60760) Backport PR #60718: DOC: Whatsnew for sorting mode result Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 13f653721a9e9..c999d726be823 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -96,7 +96,7 @@ Timezones Numeric ^^^^^^^ -- +- Enabled :class:`Series.mode` and :class:`DataFrame.mode` with ``dropna=False`` to sort the result for all dtypes in the presence of NA values; previously only certain dtypes would sort (:issue:`60702`) - Conversion From a70b88be76a807a44b4f7194a0e32b28064aace5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 22 Jan 2025 22:23:21 +0100 Subject: [PATCH 368/396] [backport 2.3.x] API(str dtype): Raise on StringDtype for unary op + (#60710) (#60763) API(str dtype): Raise on StringDtype for unary op + (#60710) (cherry picked from commit 1bb264c443f6be64ac28ff9afc0341eed0bcc455) Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/core/arrays/string_arrow.py | 4 ++++ pandas/tests/frame/test_unary.py | 6 ------ 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index c999d726be823..5faf73a696e04 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -106,6 +106,7 @@ Conversion Strings ^^^^^^^ +- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`) - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`) - Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index c15e50f698a3d..c8aea6f6bab5a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -51,6 +51,7 @@ from pandas._typing import ( ArrayLike, Dtype, + Self, npt, ) @@ -476,6 +477,9 @@ def _cmp_method(self, other, op): return result.to_numpy(np.bool_, na_value=False) return result + def __pos__(self) -> Self: + raise TypeError(f"bad operand type for unary +: '{self.dtype}'") + class ArrowStringArrayNumpySemantics(ArrowStringArray): _na_value = np.nan diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index a76d33e922486..a48b5c51f9ca7 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -3,9 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gte1p25 import pandas as pd @@ -120,9 +117,6 @@ def test_pos_object(self, df): 
tm.assert_frame_equal(+df, df) tm.assert_series_equal(+df["a"], df["a"]) - @pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" - ) @pytest.mark.parametrize( "df", [ From 9b0e866994c18b7acd7243509f37ab4ac319a324 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 23 Jan 2025 17:11:32 +0100 Subject: [PATCH 369/396] [backport 2.3.x] ENH: Enable pytables to round-trip with StringDtype (#60663) (#60771) ENH: Enable pytables to round-trip with StringDtype (#60663) Co-authored-by: William Ayd (cherry picked from commit 60325b86e28edf40cb02444367efbc8deb2b5231) Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/io/pytables.py | 36 +++++++++++--- pandas/tests/io/pytables/test_put.py | 70 ++++++++++++++++++++++------ 3 files changed, 87 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 5faf73a696e04..8052f362f233e 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -35,6 +35,7 @@ Other enhancements - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been updated to raise FutureWarning with NumPy >= 2 (:issue:`60340`) +- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 1293c9a0b8499..1480e0a171147 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -86,12 +86,16 @@ DatetimeArray, PeriodArray, ) +from pandas.core.arrays.string_ import BaseStringArray import pandas.core.common as com from pandas.core.computation.pytables import ( PyTablesExpr, maybe_expression, ) -from pandas.core.construction import extract_array +from pandas.core.construction import ( + array as pd_array, + extract_array, +) from pandas.core.indexes.api import ensure_index from pandas.core.internals import ( ArrayManager, @@ -2955,6 +2959,9 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None if isinstance(node, tables.VLArray): ret = node[0][start:stop] + dtype = getattr(attrs, "value_type", None) + if dtype is not None: + ret = pd_array(ret, dtype=dtype) else: dtype = _ensure_decoded(getattr(attrs, "value_type", None)) shape = getattr(attrs, "shape", None) @@ -3193,6 +3200,11 @@ def write_array( elif lib.is_np_dtype(value.dtype, "m"): self._handle.create_array(self.group, key, value.view("i8")) getattr(self.group, key)._v_attrs.value_type = "timedelta64" + elif isinstance(value, BaseStringArray): + vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom()) + vlarr.append(value.to_numpy()) + node = getattr(self.group, key) + node._v_attrs.value_type = str(value.dtype) elif empty_array: self.write_array_empty(key, value) else: @@ -3225,7 +3237,11 @@ def read( index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) - if using_string_dtype() and is_string_array(values, skipna=True): + if ( + using_string_dtype() + and isinstance(values, 
np.ndarray) + and is_string_array(values, skipna=True) + ): result = result.astype(StringDtype(na_value=np.nan)) return result @@ -3294,7 +3310,11 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) - if using_string_dtype() and is_string_array(values, skipna=True): + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array(values, skipna=True) + ): df = df.astype(StringDtype(na_value=np.nan)) dfs.append(df) @@ -4682,9 +4702,13 @@ def read( df = DataFrame._from_arrays([values], columns=cols_, index=index_) if not (using_string_dtype() and values.dtype.kind == "O"): assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) - if using_string_dtype() and is_string_array( - values, # type: ignore[arg-type] - skipna=True, + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array( + values, + skipna=True, + ) ): df = df.astype(StringDtype(na_value=np.nan)) frames.append(df) diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index f84a3ebfeb54a..38f0379eb9a66 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp import pandas as pd @@ -26,7 +24,6 @@ pytestmark = [ pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] @@ -54,8 +51,8 @@ def test_api_default_format(tmp_path, setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) with pd.option_context("io.hdf.default_format", "fixed"): @@ -79,8 +76,8 @@ def test_api_default_format(tmp_path, setup_path): path = tmp_path / setup_path df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) with pd.option_context("io.hdf.default_format", "fixed"): @@ -106,7 +103,7 @@ def test_put(setup_path): ) df = DataFrame( np.random.default_rng(2).standard_normal((20, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=20, freq="B"), ) store["a"] = ts @@ -166,7 +163,7 @@ def test_put_compression(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -183,7 +180,7 @@ def test_put_compression(setup_path): def test_put_compression_blosc(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -197,10 +194,20 @@ def test_put_compression_blosc(setup_path): tm.assert_frame_equal(store["c"], df) -def test_put_mixed_type(setup_path): +def test_put_datetime_ser(setup_path): + # https://github.com/pandas-dev/pandas/pull/60663 + ser = Series(3 * [Timestamp("20010102").as_unit("ns")]) + with 
ensure_clean_store(setup_path) as store: + store.put("ser", ser) + expected = ser.copy() + result = store.get("ser") + tm.assert_series_equal(result, expected) + + +def test_put_mixed_type(setup_path, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["obj1"] = "foo" @@ -220,13 +227,42 @@ def test_put_mixed_type(setup_path): with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") - with tm.assert_produces_warning(pd.errors.PerformanceWarning): + warning = None if using_infer_string else pd.errors.PerformanceWarning + with tm.assert_produces_warning(warning): store.put("df", df) expected = store.get("df") tm.assert_frame_equal(expected, df) +def test_put_str_frame(setup_path, string_dtype_arguments): + # https://github.com/pandas-dev/pandas/pull/60663 + dtype = pd.StringDtype(*string_dtype_arguments) + df = DataFrame({"a": pd.array(["x", pd.NA, "y"], dtype=dtype)}) + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + + store.put("df", df) + expected_dtype = "str" if dtype.na_value is np.nan else "string" + expected = df.astype(expected_dtype) + result = store.get("df") + tm.assert_frame_equal(result, expected) + + +def test_put_str_series(setup_path, string_dtype_arguments): + # https://github.com/pandas-dev/pandas/pull/60663 + dtype = pd.StringDtype(*string_dtype_arguments) + ser = Series(["x", pd.NA, "y"], dtype=dtype) + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + + store.put("ser", ser) + expected_dtype = "str" if dtype.na_value is np.nan else "string" + expected = ser.astype(expected_dtype) + result = store.get("ser") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("format", ["table", "fixed"]) @pytest.mark.parametrize( "index", @@ -253,7 +289,7 @@ def test_store_index_types(setup_path, format, index): tm.assert_frame_equal(df, store["df"]) -def test_column_multiindex(setup_path): +def test_column_multiindex(setup_path, using_infer_string): # GH 4710 # recreate multi-indexes properly @@ -264,6 +300,12 @@ def test_column_multiindex(setup_path): expected = df.set_axis(df.index.to_numpy()) with ensure_clean_store(setup_path) as store: + if using_infer_string: + # TODO(infer_string) make this work for string dtype + msg = "Saving a MultiIndex with an extension dtype is not supported." 
+ with pytest.raises(NotImplementedError, match=msg): + store.put("df", df) + return store.put("df", df) tm.assert_frame_equal( store["df"], expected, check_index_type=True, check_column_type=True From 0b2cc22bb118017d05a5cacd9cc365fb0eaed792 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 24 Jan 2025 22:42:32 +0100 Subject: [PATCH 370/396] [backport 2.3.x] TST(string dtype): Fix xfails in test_block_internals.py (#60765) (#60781) TST(string dtype): Fix xfails in test_block_internals.py (#60765) (cherry picked from commit d38706af66249ef74e42671a480261c68bedfbce) Co-authored-by: William Ayd --- pandas/tests/frame/conftest.py | 2 +- .../frame/constructors/test_from_dict.py | 1 - pandas/tests/frame/test_block_internals.py | 35 ++++++------------- 3 files changed, 11 insertions(+), 27 deletions(-) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 45b5d9b4aa698..b7293946d38c9 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -33,7 +33,7 @@ def float_string_frame(): df = DataFrame( np.random.default_rng(2).standard_normal((30, 4)), index=Index([f"foo_{i}" for i in range(30)], dtype=object), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), ) df["foo"] = "bar" return df diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index fc7c03dc25839..1509c47ba65c7 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -108,7 +108,6 @@ def test_constructor_list_of_series(self): expected = DataFrame.from_dict(sdict, orient="index") tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_orient(self, float_string_frame): data_dict = float_string_frame.T._series recons = DataFrame.from_dict(data_dict, orient="index") diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 0766e927a64a9..b2fcba50de097 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import PerformanceWarning import pandas.util._test_decorators as td @@ -185,21 +183,7 @@ def test_constructor_with_convert(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_construction_with_mixed(self, float_string_frame, using_infer_string): - # test construction edge cases with mixed types - - # f7u12, this does not work without extensive workaround - data = [ - [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)], - [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)], - ] - df = DataFrame(data) - - # check dtypes - result = df.dtypes - expected = Series({"datetime64[us]": 3}) - # mixed-type frames float_string_frame["datetime"] = datetime.now() float_string_frame["timedelta"] = timedelta(days=1, seconds=1) @@ -219,13 +203,11 @@ def test_construction_with_mixed(self, float_string_frame, using_infer_string): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_construction_with_conversions(self): # convert from a numpy array of non-ns timedelta64; as of 2.0 this does # *not* convert arr = np.array([1, 2, 3], dtype="timedelta64[s]") - df = 
DataFrame(index=range(3)) - df["A"] = arr + df = DataFrame({"A": arr}) expected = DataFrame( {"A": pd.timedelta_range("00:00:01", periods=3, freq="s")}, index=range(3) ) @@ -243,11 +225,11 @@ def test_construction_with_conversions(self): assert expected.dtypes["dt1"] == "M8[s]" assert expected.dtypes["dt2"] == "M8[s]" - df = DataFrame(index=range(3)) - df["dt1"] = np.datetime64("2013-01-01") - df["dt2"] = np.array( + dt1 = np.datetime64("2013-01-01") + dt2 = np.array( ["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]" ) + df = DataFrame({"dt1": dt1, "dt2": dt2}) # df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01 # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]') @@ -440,14 +422,17 @@ def test_update_inplace_sets_valid_block_values(using_copy_on_write): assert df.isnull().sum().sum() == 0 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_nonconsolidated_item_cache_take(): # https://github.com/pandas-dev/pandas/issues/35521 # create non-consolidated dataframe with object dtype columns - df = DataFrame() - df["col1"] = Series(["a"], dtype=object) + df = DataFrame( + { + "col1": Series(["a"], dtype=object), + } + ) df["col2"] = Series([0], dtype=object) + assert not df._mgr.is_consolidated() # access column (item cache) df["col1"] == "A" From c0983977edcce05a279b8421d12e3ae9a1fb6f6f Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 24 Jan 2025 16:43:05 -0500 Subject: [PATCH 371/396] =?UTF-8?q?Backport=20PR=20#60711:=20TST(string=20?= =?UTF-8?q?dtype):=20Resolve=20xfail=20in=20groupby.test=5F=E2=80=A6=20(#6?= =?UTF-8?q?0782)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #60711: TST(string dtype): Resolve xfail in groupby.test_size Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/tests/groupby/methods/test_size.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index 271802c447024..4e92fb22f840a 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import is_integer_dtype from pandas import ( @@ -108,18 +106,16 @@ def test_size_series_masked_type_returns_Int64(dtype): tm.assert_series_equal(result, expected) -# TODO(infer_string) in case the column is object dtype, it should preserve that dtype -# for the result's index -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_size_strings(any_string_dtype): +def test_size_strings(any_string_dtype, using_infer_string): # GH#55627 dtype = any_string_dtype df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) result = df.groupby("a")["b"].size() exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" + exp_index_dtype = "str" if using_infer_string and dtype == "object" else dtype expected = Series( [2, 1], - index=Index(["a", "b"], name="a", dtype=dtype), + index=Index(["a", "b"], name="a", dtype=exp_index_dtype), name="b", dtype=exp_dtype, ) From f8196e46f2377ae91ca49638e4a38984b401de90 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 24 Jan 2025 23:09:18 +0100 Subject: [PATCH 372/396] [backport 2.3.x] CI: Test Github Actions Arm64 Runners (#60722) (#60780) (cherry picked from commit 
b98336653128790661d4c66d398f3e44d481dd3b) Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 23 ++++++++++++++++++++--- .github/workflows/wheels.yml | 1 + 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 210852d0cd809..7682989b5ee63 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -22,10 +22,11 @@ defaults: jobs: ubuntu: - runs-on: ubuntu-22.04 + runs-on: ${{ matrix.platform }} timeout-minutes: 90 strategy: matrix: + platform: [ubuntu-22.04, ubuntu-24.04-arm] env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml] # Prevent the include jobs from overriding other jobs pattern: [""] @@ -35,9 +36,11 @@ jobs: env_file: actions-311-downstream_compat.yaml pattern: "not slow and not network and not single_cpu" pytest_target: "pandas/tests/test_downstream.py" + platform: ubuntu-22.04 - name: "Minimum Versions" env_file: actions-39-minimum_versions.yaml pattern: "not slow and not network and not single_cpu" + platform: ubuntu-22.04 - name: "Locale: it_IT" env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" @@ -48,6 +51,7 @@ jobs: # Also install it_IT (its encoding is ISO8859-1) but do not activate it. # It will be temporarily activated during tests with locale.setlocale extra_loc: "it_IT" + platform: ubuntu-22.04 - name: "Locale: zh_CN" env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" @@ -58,57 +62,70 @@ jobs: # Also install zh_CN (its encoding is gb2312) but do not activate it. # It will be temporarily activated during tests with locale.setlocale extra_loc: "zh_CN" + platform: ubuntu-22.04 - name: "Copy-on-Write 3.9" env_file: actions-39.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "1" + platform: ubuntu-22.04 - name: "Copy-on-Write 3.10" env_file: actions-310.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "1" + platform: ubuntu-22.04 - name: "Copy-on-Write 3.11" env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "1" + platform: ubuntu-22.04 - name: "Copy-on-Write 3.12" env_file: actions-312.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "1" + platform: ubuntu-22.04 - name: "Copy-on-Write 3.11 (warnings)" env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "warn" + platform: ubuntu-22.04 - name: "Copy-on-Write 3.10 (warnings)" env_file: actions-310.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "warn" + platform: ubuntu-22.04 - name: "Copy-on-Write 3.9 (warnings)" env_file: actions-39.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "warn" + platform: ubuntu-22.04 - name: "Future infer strings" env_file: actions-312.yaml pandas_future_infer_string: "1" pandas_copy_on_write: "1" + platform: ubuntu-22.04 - name: "Future infer strings (without pyarrow)" env_file: actions-311.yaml pandas_future_infer_string: "1" pandas_copy_on_write: "1" + platform: ubuntu-22.04 - name: "Pypy" env_file: actions-pypy-39.yaml pattern: "not slow and not network and not single_cpu" test_args: "--max-worker-restart 0" + platform: ubuntu-22.04 - name: "Numpy Dev" env_file: actions-311-numpydev.yaml pattern: "not slow and not network and not single_cpu" test_args: "-W 
error::DeprecationWarning -W error::FutureWarning" + platform: ubuntu-22.04 - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" pandas_future_infer_string: "1" pandas_copy_on_write: "1" + platform: ubuntu-22.04 fail-fast: false - name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} + name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }}-${{ matrix.platform }} env: PATTERN: ${{ matrix.pattern }} LANG: ${{ matrix.lang || 'C.UTF-8' }} @@ -124,7 +141,7 @@ jobs: REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }} concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }}-${{ matrix.pandas_future_infer_string }} + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }}-${{ matrix.pandas_future_infer_string }}-${{ matrix.platform }} cancel-in-progress: true services: diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index eac63b424ead8..3a0f73ab22ad2 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -94,6 +94,7 @@ jobs: buildplat: - [ubuntu-22.04, manylinux_x86_64] - [ubuntu-22.04, musllinux_x86_64] + - [ubuntu-24.04-arm, manylinux_aarch64] - [macos-13, macosx_x86_64] # Note: M1 images on Github Actions start from macOS 14 - [macos-14, macosx_arm64] From 6bb498b889694018e47b0ab88edc6a9e0bea9a70 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 24 Jan 2025 19:15:21 -0800 Subject: [PATCH 373/396] Backport PR #60784 on branch 2.3.x (TST(string_dtype): Fix minor issue with CSV parser and column dtype) (#60785) Backport PR #60784: TST(string_dtype): Fix minor issue with CSV parser and column dtype Co-authored-by: William Ayd --- pandas/io/parsers/arrow_parser_wrapper.py | 3 ++- pandas/tests/io/parser/common/test_index.py | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index a7f01e6322755..7fe5ecb0e54c2 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -165,7 +165,8 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: # The only way self.names is not the same length as number of cols is # if we have int index_col. We should just pad the names(they will get # removed anyways) to expected length then. 
- self.names = list(range(num_cols - len(self.names))) + self.names + columns_prefix = [str(x) for x in range(num_cols - len(self.names))] + self.names = columns_prefix + self.names multi_index_named = False frame.columns = self.names # we only need the frame not the names diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index cdd65223a9c9f..aaa14216bd6d6 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -89,9 +89,6 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): def test_multi_index_no_level_names( request, all_parsers, index_col, using_infer_string ): - if using_infer_string and all_parsers.engine == "pyarrow": - # result should have string columns instead of object dtype - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) data = """index1,index2,A,B,C,D foo,one,2,3,4,5 foo,two,7,8,9,10 From 6451f79e0fe67816066c02dcaed8812dc4f57b79 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 26 Jan 2025 10:52:45 +0100 Subject: [PATCH 374/396] [backport 2.3.x] CI: Remove CircleCI in favor of GHA ARM builds (#60761) (#60790) CI: Remove CircleCI in favor of GHA ARM builds (#60761) (cherry picked from commit f3045db91dbb89306c15b1673987cc70912a76b5) Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .circleci/config.yml | 143 ---------------------------------- .gitattributes | 1 - ci/deps/circle-310-arm64.yaml | 62 --------------- pandas/tests/io/conftest.py | 7 +- 4 files changed, 3 insertions(+), 210 deletions(-) delete mode 100644 .circleci/config.yml delete mode 100644 ci/deps/circle-310-arm64.yaml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 2c52d7aee4e28..0000000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,143 +0,0 @@ -version: 2.1 - -jobs: - test-arm: - machine: - image: default - resource_class: arm.large - environment: - ENV_FILE: ci/deps/circle-310-arm64.yaml - PYTEST_WORKERS: auto - PATTERN: "not single_cpu and not slow and not network and not clipboard and not arm_slow and not db" - PYTEST_TARGET: "pandas" - PANDAS_CI: "1" - steps: - - checkout - - run: .circleci/setup_env.sh - - run: | - PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH \ - LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD \ - ci/run_tests.sh - linux-musl: - docker: - - image: quay.io/pypa/musllinux_1_1_aarch64 - resource_class: arm.large - steps: - # Install pkgs first to have git in the image - # (needed for checkout) - - run: | - apk update - apk add git - apk add musl-locales - - checkout - - run: | - /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev - . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil "pytz<2024.2" pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 - python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" - python -m pip list --no-cache-dir - - run: | - . 
~/virtualenvs/pandas-dev/bin/activate - export PANDAS_CI=1 - python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml - build-aarch64: - parameters: - cibw-build: - type: string - machine: - image: default - resource_class: arm.large - environment: - TRIGGER_SOURCE: << pipeline.trigger_source >> - steps: - - checkout - - run: - name: Check if build is necessary - command: | - # Check if tag is defined or TRIGGER_SOURCE is scheduled - if [[ -n "$CIRCLE_TAG" ]]; then - echo 'export IS_PUSH="true"' >> "$BASH_ENV" - elif [[ $TRIGGER_SOURCE == "scheduled_pipeline" ]]; then - echo 'export IS_SCHEDULE_DISPATCH="true"' >> "$BASH_ENV" - # Look for the build label/[wheel build] in commit - # grep takes a regex, so need to escape brackets - elif (git log --format=oneline -n 1 $CIRCLE_SHA1) | grep -q '\[wheel build\]'; then - : # Do nothing - elif ! (curl https://api.github.com/repos/pandas-dev/pandas/issues/$CIRCLE_PR_NUMBER | jq '.labels' | grep -q 'Build'); then - circleci-agent step halt - fi - - run: - name: Build aarch64 wheels - no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that - command: | - pip3 install cibuildwheel==2.20.0 - if [[ $CIBW_BUILD == cp313t* ]]; then - # TODO: temporarily run 3.13 free threaded builds without build isolation - # since we need pre-release cython - CIBW_BUILD_FRONTEND="pip; args: --no-build-isolation" cibuildwheel --prerelease-pythons --output-dir wheelhouse - else - cibuildwheel --prerelease-pythons --output-dir wheelhouse - fi - - environment: - CIBW_BUILD: << parameters.cibw-build >> - - - run: - name: Install Anaconda Client & Upload Wheels - command: | - echo "Install Mambaforge" - MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/23.1.0-0/Mambaforge-23.1.0-0-Linux-aarch64.sh" - echo "Downloading $MAMBA_URL" - wget -q $MAMBA_URL -O minimamba.sh - chmod +x minimamba.sh - - MAMBA_DIR="$HOME/miniconda3" - rm -rf $MAMBA_DIR - ./minimamba.sh -b -p $MAMBA_DIR - - export PATH=$MAMBA_DIR/bin:$PATH - - mamba install -y -c conda-forge anaconda-client - - source ci/upload_wheels.sh - set_upload_vars - upload_wheels - - store_artifacts: - path: wheelhouse/ - -workflows: - test: - # Don't run trigger this one when scheduled pipeline runs - when: - not: - equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] - jobs: - - test-arm - test-musl: - # Don't run trigger this one when scheduled pipeline runs - when: - not: - equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] - jobs: - - linux-musl - build-wheels: - jobs: - - build-aarch64: - filters: - tags: - only: /^v.*/ - matrix: - parameters: - cibw-build: ["cp39-manylinux_aarch64", - "cp310-manylinux_aarch64", - "cp311-manylinux_aarch64", - "cp312-manylinux_aarch64", - "cp313-manylinux_aarch64", - "cp313t-manylinux_aarch64", - "cp39-musllinux_aarch64", - "cp310-musllinux_aarch64", - "cp311-musllinux_aarch64", - "cp312-musllinux_aarch64", - "cp313-musllinux_aarch64", - "cp313t-musllinux_aarch64"] diff --git a/.gitattributes b/.gitattributes index 2655d0d018d4f..bc7dec642df0f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -61,7 +61,6 @@ pandas/_version.py export-subst *.pxi export-ignore # Ignoring stuff from the top level -.circleci export-ignore .github export-ignore asv_bench export-ignore ci export-ignore diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml deleted file mode 100644 index 
eeb1cb48b1018..0000000000000 --- a/ci/deps/circle-310-arm64.yaml +++ /dev/null @@ -1,62 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.10 - - # build dependencies - - versioneer - - cython>=0.29.33 - - meson=1.2.1 - - meson-python=0.13.1 - - # test dependencies - - pytest>=7.3.2 - - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 - - boto3 - - # required dependencies - - python-dateutil - - numpy - # pytz 2024.2 timezones cause wrong results - - pytz < 2024.2 - - # optional dependencies - - beautifulsoup4>=4.11.2 - - blosc>=1.21.3 - - bottleneck>=1.3.6 - - fastparquet>=2022.12.0 - - fsspec>=2022.11.0 - - html5lib>=1.1 - - hypothesis>=6.46.1 - - gcsfs>=2022.11.0 - - jinja2>=3.1.2 - - lxml>=4.9.2 - - matplotlib>=3.6.3 - - numba>=0.56.4 - - numexpr>=2.8.4 - - odfpy>=1.4.1 - - qtpy>=2.3.0 - - openpyxl>=3.1.0 - - psycopg2>=2.9.6 - - pyarrow>=10.0.1 - - pymysql>=1.0.2 - - pyqt>=5.15.9 - - pyreadstat>=1.2.0 - - pytables>=3.8.0 - - python-calamine>=0.1.7 - - pyxlsb>=1.0.10 - - s3fs>=2022.11.0 - - scipy>=1.10.0 - - sqlalchemy>=2.0.0 - - tabulate>=0.9.0 - - xarray>=2022.12.0, <2024.10.0 - - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 - - pip: - - adbc-driver-postgresql>=0.8.0 - - adbc-driver-sqlite>=0.8.0 diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index bdefadf3dbec0..a5ddda9d66e7a 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -67,14 +67,13 @@ def s3_base(worker_id, monkeypatch): monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") if is_ci_environment(): if is_platform_arm() or is_platform_mac() or is_platform_windows(): - # NOT RUN on Windows/macOS/ARM, only Ubuntu + # NOT RUN on Windows/macOS, only Ubuntu # - subprocess in CI can cause timeouts # - GitHub Actions do not support # container services for the above OSs - # - CircleCI will probably hit the Docker rate pull limit pytest.skip( - "S3 tests do not have a corresponding service in " - "Windows, macOS or ARM platforms" + "S3 tests do not have a corresponding service on " + "Windows or macOS platforms" ) else: # set in .github/workflows/unit-tests.yml From bb41546a60c8443503e39b697a63d36a7a43823c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 26 Jan 2025 14:50:38 +0100 Subject: [PATCH 375/396] [backport 2.3.x] BUG: fix construction of Series / Index from dict keys when "str" dtype is specified explicitly (#60436) (#60793) BUG: fix construction of Series / Index from dict keys when "str" dtype is specified explicitly (#60436) Co-authored-by: Joris Van den Bossche (cherry picked from commit 84bf1ef82912ebf497a304b0ffd90914bfc41ea9) Co-authored-by: tasfia8 <117693390+tasfia8@users.noreply.github.com> --- pandas/core/construction.py | 2 ++ pandas/tests/base/test_constructors.py | 11 +++++++++++ pandas/tests/io/test_fsspec.py | 1 - 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 584a1d417d198..59e87f28a3dce 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -589,6 +589,8 @@ def sanitize_array( # create an extension array from its dtype _sanitize_non_ordered(data) cls = dtype.construct_array_type() + if not hasattr(data, "__array__"): + data = list(data) subarr = cls._from_sequence(data, dtype=dtype, copy=copy) # GH#846 diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py index f3ac60f672ee1..3434c8110a79c 100644 --- 
a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -177,3 +177,14 @@ def test_constructor_datetime_nonns(self, constructor): arr.flags.writeable = False result = constructor(arr) tm.assert_equal(result, expected) + + def test_constructor_from_dict_keys(self, constructor, using_infer_string): + # https://github.com/pandas-dev/pandas/issues/60343 + d = {"a": 1, "b": 2} + result = constructor(d.keys(), dtype="str") + if using_infer_string: + assert result.dtype == "str" + else: + assert result.dtype == "object" + expected = constructor(list(d.keys()), dtype="str") + tm.assert_equal(result, expected) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index e0d652facb8fc..dde85f9f8409d 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -197,7 +197,6 @@ def test_arrowparquet_options(fsspectest): @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_fastparquet_options(fsspectest): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") From 2e617d36af3592a371fe09a1aec8282f9db550da Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 26 Jan 2025 19:08:31 +0100 Subject: [PATCH 376/396] [2.3.x] DEPS: Drop Python 3.9 (#60792) --- .github/workflows/package-checks.yml | 2 +- .github/workflows/unit-tests.yml | 16 +---- .github/workflows/wheels.yml | 2 +- ...yaml => actions-310-minimum_versions.yaml} | 2 +- ci/deps/actions-39.yaml | 64 ------------------- doc/source/whatsnew/v2.3.0.rst | 6 ++ 6 files changed, 12 insertions(+), 80 deletions(-) rename ci/deps/{actions-39-minimum_versions.yaml => actions-310-minimum_versions.yaml} (98%) delete mode 100644 ci/deps/actions-39.yaml diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index e1be5659bbd9a..485a890e26abd 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -53,7 +53,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - python-version: ['3.9', '3.10', '3.11'] + python-version: ['3.10', '3.11'] fail-fast: false name: Test Conda Forge Recipe - Python ${{ matrix.python-version }} concurrency: diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 7682989b5ee63..091b8eb48ac3b 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -27,7 +27,7 @@ jobs: strategy: matrix: platform: [ubuntu-22.04, ubuntu-24.04-arm] - env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml] + env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml] # Prevent the include jobs from overriding other jobs pattern: [""] pandas_future_infer_string: ["0"] @@ -38,7 +38,7 @@ jobs: pytest_target: "pandas/tests/test_downstream.py" platform: ubuntu-22.04 - name: "Minimum Versions" - env_file: actions-39-minimum_versions.yaml + env_file: actions-310-minimum_versions.yaml pattern: "not slow and not network and not single_cpu" platform: ubuntu-22.04 - name: "Locale: it_IT" @@ -63,11 +63,6 @@ jobs: # It will be temporarily activated during tests with locale.setlocale extra_loc: "zh_CN" platform: ubuntu-22.04 - - name: "Copy-on-Write 3.9" - env_file: actions-39.yaml - pattern: "not slow and not network and not single_cpu" - pandas_copy_on_write: "1" - platform: ubuntu-22.04 - name: "Copy-on-Write 3.10" env_file: 
actions-310.yaml pattern: "not slow and not network and not single_cpu" @@ -93,11 +88,6 @@ jobs: pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "warn" platform: ubuntu-22.04 - - name: "Copy-on-Write 3.9 (warnings)" - env_file: actions-39.yaml - pattern: "not slow and not network and not single_cpu" - pandas_copy_on_write: "warn" - platform: ubuntu-22.04 - name: "Future infer strings" env_file: actions-312.yaml pandas_future_infer_string: "1" @@ -228,7 +218,7 @@ jobs: matrix: # Note: Don't use macOS latest since macos 14 appears to be arm64 only os: [macos-13, macos-14, windows-latest] - env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml] + env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml] fail-fast: false runs-on: ${{ matrix.os }} name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }} diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 3a0f73ab22ad2..e5d13307973e0 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -100,7 +100,7 @@ jobs: - [macos-14, macosx_arm64] - [windows-2022, win_amd64] # TODO: support PyPy? - python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]] + python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]] include: # TODO: Remove this plus installing build deps in cibw_before_build.sh # after pandas can be built with a released NumPy/Cython diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml similarity index 98% rename from ci/deps/actions-39-minimum_versions.yaml rename to ci/deps/actions-310-minimum_versions.yaml index 6e38a7c5f0774..cac0814acfbd5 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -4,7 +4,7 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.9 + - python=3.10 # build dependencies - versioneer diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml deleted file mode 100644 index 4320e9060fb4a..0000000000000 --- a/ci/deps/actions-39.yaml +++ /dev/null @@ -1,64 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.9 - - # build dependencies - - versioneer[toml] - - cython>=0.29.33 - - meson[ninja]=1.2.1 - - meson-python=0.13.1 - - # test dependencies - - pytest>=7.3.2 - - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-qt>=4.2.0 - - boto3 - - # required dependencies - - python-dateutil - - numpy - # pytz 2024.2 timezones cause wrong results - - pytz<2024.2 - - # optional dependencies - - beautifulsoup4>=4.11.2 - - blosc>=1.21.3 - - bottleneck>=1.3.6 - - fastparquet>=2022.12.0 - - fsspec>=2022.11.0 - - html5lib>=1.1 - - hypothesis>=6.46.1 - - gcsfs>=2022.11.0 - - jinja2>=3.1.2 - - lxml>=4.9.2 - - matplotlib>=3.6.3 - - numba>=0.56.4 - - numexpr>=2.8.4 - - odfpy>=1.4.1 - - qtpy>=2.3.0 - - openpyxl>=3.1.0 - - psycopg2>=2.9.6 - - pyarrow>=10.0.1 - - pymysql>=1.0.2 - - pyqt>=5.15.9 - - pyreadstat>=1.2.0 - - pytables>=3.8.0 - - python-calamine>=0.1.7 - - pyxlsb>=1.0.10 - - s3fs>=2022.11.0 - - scipy>=1.10.0 - - sqlalchemy>=2.0.0 - - tabulate>=0.9.0 - - xarray>=2022.12.0 - - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 - - - pip: - - adbc-driver-postgresql>=0.8.0 - - adbc-driver-sqlite>=0.8.0 - - tzdata>=2022.7 - - pytest-localserver>=0.7.1 diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 8052f362f233e..3ee54bcc1c6e2 100644 
--- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -52,6 +52,12 @@ These are bug fixes that might have notable behavior changes. notable_bug_fix1 ^^^^^^^^^^^^^^^^ + +Increased minimum version for Python +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +pandas 2.3.0 supports Python 3.10 and higher. + .. --------------------------------------------------------------------------- .. _whatsnew_230.deprecations: From 1e487982ff7501f07e2bba7a7d924fb92b3d5c7f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 26 Jan 2025 13:22:22 -0800 Subject: [PATCH 377/396] Backport PR #59906 on branch 2.3.x (BLD: Fix armv7 build) (#59937) Backport PR #59906: BLD: Fix armv7 build Co-authored-by: Marc Mueller <30130371+cdce8p@users.noreply.github.com> Co-authored-by: Joris Van den Bossche --- pandas/_libs/src/vendored/ujson/python/JSONtoObj.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c index 7cc20a52f1849..4cfead8ac77a5 100644 --- a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c @@ -38,9 +38,11 @@ Numeric decoder derived from TCL library // Licence at LICENSES/ULTRAJSON_LICENSE -#include "pandas/vendored/ujson/lib/ultrajson.h" +// clang-format off #define PY_SSIZE_T_CLEAN #include +#include "pandas/vendored/ujson/lib/ultrajson.h" +// clang-format on static int Object_objectAddKey(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ name, JSOBJ value) { From 04c3e817700b9cc94d85b46f4219e4116b694895 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 28 Jan 2025 10:19:14 -0800 Subject: [PATCH 378/396] Backport PR #60796 on branch 2.3.x (BUG: is_*_array returns true on empty object dtype) (#60808) Backport PR #60796: BUG: is_*_array returns true on empty object dtype Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/_libs/lib.pyx | 36 +++++++++++++-------------- pandas/tests/dtypes/test_inference.py | 25 +++++++++++++++++++ 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f72d6a5dad877..87cbadaa811f7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1846,7 +1846,7 @@ cdef class BoolValidator(Validator): cpdef bint is_bool_array(ndarray values, bint skipna=False): cdef: - BoolValidator validator = BoolValidator(len(values), + BoolValidator validator = BoolValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1864,7 +1864,7 @@ cdef class IntegerValidator(Validator): # Note: only python-exposed for tests cpdef bint is_integer_array(ndarray values, bint skipna=True): cdef: - IntegerValidator validator = IntegerValidator(len(values), + IntegerValidator validator = IntegerValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1879,7 +1879,7 @@ cdef class IntegerNaValidator(Validator): cdef bint is_integer_na_array(ndarray values, bint skipna=True): cdef: - IntegerNaValidator validator = IntegerNaValidator(len(values), + IntegerNaValidator validator = IntegerNaValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1895,7 +1895,7 @@ cdef class IntegerFloatValidator(Validator): cdef bint is_integer_float_array(ndarray values, bint skipna=True): cdef: - IntegerFloatValidator 
validator = IntegerFloatValidator(len(values), + IntegerFloatValidator validator = IntegerFloatValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1913,7 +1913,7 @@ cdef class FloatValidator(Validator): # Note: only python-exposed for tests cpdef bint is_float_array(ndarray values): cdef: - FloatValidator validator = FloatValidator(len(values), values.dtype) + FloatValidator validator = FloatValidator(values.size, values.dtype) return validator.validate(values) @@ -1931,7 +1931,7 @@ cdef class ComplexValidator(Validator): cdef bint is_complex_array(ndarray values): cdef: - ComplexValidator validator = ComplexValidator(len(values), values.dtype) + ComplexValidator validator = ComplexValidator(values.size, values.dtype) return validator.validate(values) @@ -1944,7 +1944,7 @@ cdef class DecimalValidator(Validator): cdef bint is_decimal_array(ndarray values, bint skipna=False): cdef: DecimalValidator validator = DecimalValidator( - len(values), values.dtype, skipna=skipna + values.size, values.dtype, skipna=skipna ) return validator.validate(values) @@ -1960,7 +1960,7 @@ cdef class StringValidator(Validator): cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: - StringValidator validator = StringValidator(len(values), + StringValidator validator = StringValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -1977,7 +1977,7 @@ cdef class BytesValidator(Validator): cdef bint is_bytes_array(ndarray values, bint skipna=False): cdef: - BytesValidator validator = BytesValidator(len(values), values.dtype, + BytesValidator validator = BytesValidator(values.size, values.dtype, skipna=skipna) return validator.validate(values) @@ -2028,7 +2028,7 @@ cdef class DatetimeValidator(TemporalValidator): cpdef bint is_datetime_array(ndarray values, bint skipna=True): cdef: - DatetimeValidator validator = DatetimeValidator(len(values), + DatetimeValidator validator = DatetimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2042,7 +2042,7 @@ cdef class Datetime64Validator(DatetimeValidator): # Note: only python-exposed for tests cpdef bint is_datetime64_array(ndarray values, bint skipna=True): cdef: - Datetime64Validator validator = Datetime64Validator(len(values), + Datetime64Validator validator = Datetime64Validator(values.size, skipna=skipna) return validator.validate(values) @@ -2057,7 +2057,7 @@ cdef class AnyDatetimeValidator(DatetimeValidator): cdef bint is_datetime_or_datetime64_array(ndarray values, bint skipna=True): cdef: - AnyDatetimeValidator validator = AnyDatetimeValidator(len(values), + AnyDatetimeValidator validator = AnyDatetimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2069,7 +2069,7 @@ def is_datetime_with_singletz_array(values: ndarray) -> bool: Doesn't check values are datetime-like types. """ cdef: - Py_ssize_t i = 0, j, n = len(values) + Py_ssize_t i = 0, j, n = values.size object base_val, base_tz, val, tz if n == 0: @@ -2117,7 +2117,7 @@ cpdef bint is_timedelta_or_timedelta64_array(ndarray values, bint skipna=True): Infer with timedeltas and/or nat/none. 
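The recurring `len(values)` to `values.size` edit above matters for multi-dimensional empty arrays, where the two disagree. A quick NumPy-only illustration:

import numpy as np

arr = np.empty((2, 0), dtype=object)
print(len(arr))   # 2 -- the number of rows
print(arr.size)   # 0 -- the total number of elements

# Sizing the validators with len() let a (2, 0) object array slip past
# the n == 0 guard; iteration then saw no elements, so the is_*_array
# helpers vacuously returned True for empty object-dtype input.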
""" cdef: - AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values), + AnyTimedeltaValidator validator = AnyTimedeltaValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2131,7 +2131,7 @@ cdef class DateValidator(Validator): # Note: only python-exposed for tests cpdef bint is_date_array(ndarray values, bint skipna=False): cdef: - DateValidator validator = DateValidator(len(values), skipna=skipna) + DateValidator validator = DateValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2144,7 +2144,7 @@ cdef class TimeValidator(Validator): # Note: only python-exposed for tests cpdef bint is_time_array(ndarray values, bint skipna=False): cdef: - TimeValidator validator = TimeValidator(len(values), skipna=skipna) + TimeValidator validator = TimeValidator(values.size, skipna=skipna) return validator.validate(values) @@ -2195,14 +2195,14 @@ cpdef bint is_interval_array(ndarray values): Is this an ndarray of Interval (or np.nan) with a single dtype? """ cdef: - Py_ssize_t i, n = len(values) + Py_ssize_t i, n = values.size str closed = None bint numeric = False bint dt64 = False bint td64 = False object val - if len(values) == 0: + if n == 0: return False for i in range(n): diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 0567be737c681..79b7e6ff092b6 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1585,6 +1585,31 @@ def test_is_string_array(self): ) assert not lib.is_string_array(np.array([1, 2])) + @pytest.mark.parametrize( + "func", + [ + "is_bool_array", + "is_date_array", + "is_datetime_array", + "is_datetime64_array", + "is_float_array", + "is_integer_array", + "is_interval_array", + "is_string_array", + "is_time_array", + "is_timedelta_or_timedelta64_array", + ], + ) + def test_is_dtype_array_empty_obj(self, func): + # https://github.com/pandas-dev/pandas/pull/60796 + func = getattr(lib, func) + + arr = np.empty((2, 0), dtype=object) + assert not func(arr) + + arr = np.empty((0, 2), dtype=object) + assert not func(arr) + def test_to_object_array_tuples(self): r = (5, 6) values = [r] From 97a06de352d9ab7797b0ff449375a1491ef6a307 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Wed, 5 Feb 2025 05:54:49 -0500 Subject: [PATCH 379/396] Backport PR #60709: ENH(string dtype): Make str.decode return str dtype (#60821) --- .github/actions/run-tests/action.yml | 2 +- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/core/strings/accessor.py | 10 +++++++--- pandas/io/pytables.py | 4 +++- pandas/io/sas/sas7bdat.py | 6 ++++++ pandas/tests/io/sas/test_sas7bdat.py | 16 ++++++---------- pandas/tests/strings/test_strings.py | 9 +++++---- 7 files changed, 29 insertions(+), 19 deletions(-) diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml index fd7c3587f2254..e4b209d83913d 100644 --- a/.github/actions/run-tests/action.yml +++ b/.github/actions/run-tests/action.yml @@ -7,7 +7,7 @@ runs: shell: bash -el {0} - name: Publish test results - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Test results path: test-data.xml diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 3ee54bcc1c6e2..0b7c2bac1be6a 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -35,6 +35,7 @@ Other enhancements - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. 
called when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been updated to raise FutureWarning with NumPy >= 2 (:issue:`60340`) +- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 563dce3008480..3c4cf60ab262a 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -13,6 +13,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import lib from pandas._typing import ( AlignJoin, @@ -387,7 +389,9 @@ def cons_row(x): # This is a mess. _dtype: DtypeObj | str | None = dtype vdtype = getattr(result, "dtype", None) - if self._is_string: + if _dtype is not None: + pass + elif self._is_string: if is_bool_dtype(vdtype): _dtype = result.dtype elif returns_string: @@ -2012,9 +2016,9 @@ def decode(self, encoding, errors: str = "strict"): decoder = codecs.getdecoder(encoding) f = lambda x: decoder(x, errors)[0] arr = self._data.array - # assert isinstance(arr, (StringArray,)) result = arr._str_map(f) - return self._wrap_result(result) + dtype = "str" if get_option("future.infer_string") else None + return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) def encode(self, encoding, errors: str = "strict"): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 1480e0a171147..dbe2db9f9625b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -5208,7 +5208,9 @@ def _unconvert_string_array( dtype = f"U{itemsize}" if isinstance(data[0], bytes): - data = Series(data, copy=False).str.decode(encoding, errors=errors)._values + ser = Series(data, copy=False).str.decode(encoding, errors=errors) + data = ser.to_numpy() + data.flags.writeable = True else: data = data.astype(dtype, copy=False).astype(object, copy=False) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index c5bdfb5541788..1d424425cd927 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -25,6 +25,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs.byteswap import ( read_double_with_byteswap, read_float_with_byteswap, @@ -722,6 +724,7 @@ def _chunk_to_dataframe(self) -> DataFrame: rslt = {} js, jb = 0, 0 + infer_string = get_option("future.infer_string") for j in range(self.column_count): name = self.column_names[j] @@ -738,6 +741,9 @@ def _chunk_to_dataframe(self) -> DataFrame: rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False) if self.convert_text and (self.encoding is not None): rslt[name] = self._decode_string(rslt[name].str) + if infer_string: + rslt[name] = rslt[name].astype("str") + js += 1 else: self.close() diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 493971f9f56ef..96aaa1e9bcb21 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import IS64 from pandas.errors import EmptyDataError import 
pandas.util._test_decorators as td @@ -18,10 +16,6 @@ from pandas.io.sas.sas7bdat import SAS7BDATReader -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - @pytest.fixture def dirpath(datapath): @@ -254,11 +248,13 @@ def test_zero_variables(datapath): pd.read_sas(fname) -def test_zero_rows(datapath): +@pytest.mark.parametrize("encoding", [None, "utf8"]) +def test_zero_rows(datapath, encoding): # GH 18198 fname = datapath("io", "sas", "data", "zero_rows.sas7bdat") - result = pd.read_sas(fname) - expected = pd.DataFrame([{"char_field": "a", "num_field": 1.0}]).iloc[:0] + result = pd.read_sas(fname, encoding=encoding) + str_value = b"a" if encoding is None else "a" + expected = pd.DataFrame([{"char_field": str_value, "num_field": 1.0}]).iloc[:0] tm.assert_frame_equal(result, expected) @@ -414,7 +410,7 @@ def test_0x40_control_byte(datapath): fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat") df = pd.read_sas(fname, encoding="ascii") fname = datapath("io", "sas", "data", "0x40controlbyte.csv") - df0 = pd.read_csv(fname, dtype="object") + df0 = pd.read_csv(fname, dtype="str") tm.assert_frame_equal(df, df0) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 7c396e65b6120..59a06a421f53e 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -95,6 +95,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat): def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) + empty_inferred_str = Series(dtype="str") if is_object_or_nan_string_dtype(any_string_dtype): empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) @@ -154,7 +155,7 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_str, empty.str.rstrip()) tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) - tm.assert_series_equal(empty_object, empty_bytes.str.decode("ascii")) + tm.assert_series_equal(empty_inferred_str, empty_bytes.str.decode("ascii")) tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) # ismethods should always return boolean (GH 29624) tm.assert_series_equal(empty_bool, empty.str.isalnum()) @@ -564,7 +565,7 @@ def test_string_slice_out_of_bounds(any_string_dtype): def test_encode_decode(any_string_dtype): ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8") result = ser.str.decode("utf-8") - expected = ser.map(lambda x: x.decode("utf-8")).astype(object) + expected = Series(["a", "b", "a\xe4"], dtype="str") tm.assert_series_equal(result, expected) @@ -594,7 +595,7 @@ def test_decode_errors_kwarg(): ser.str.decode("cp1252") result = ser.str.decode("cp1252", "ignore") - expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype(object) + expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype("str") tm.assert_series_equal(result, expected) @@ -749,5 +750,5 @@ def test_get_with_dict_label(): def test_series_str_decode(): # GH 22613 result = Series([b"x", b"y"]).str.decode(encoding="UTF-8", errors="strict") - expected = Series(["x", "y"], dtype="object") + expected = Series(["x", "y"], dtype="str") tm.assert_series_equal(result, expected) From 8d18e04e239710a744f6900840829f7f536fd9ec Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 5 Feb 2025 11:55:58 +0100 Subject: [PATCH 380/396] [backport 2.3.x] TST(string_dtype): Refine scope of string xfail in test_http_headers (#60811) (#60857) 
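As an illustration of the `str.decode` change in the previous patch, a sketch assuming the `future.infer_string` option is enabled:

import pandas as pd

pd.set_option("future.infer_string", True)

ser = pd.Series([b"x", b"y"])
result = ser.str.decode(encoding="utf-8", errors="strict")
print(result.dtype)     # str (object before #60709)
print(result.tolist())  # ['x', 'y']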
Co-authored-by: William Ayd --- pandas/tests/io/test_http_headers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_http_headers.py b/pandas/tests/io/test_http_headers.py index 26e1412466e7b..9918435cae15b 100644 --- a/pandas/tests/io/test_http_headers.py +++ b/pandas/tests/io/test_http_headers.py @@ -85,7 +85,6 @@ def stata_responder(df): return bio.getvalue() -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "responder, read_method", [ @@ -108,6 +107,7 @@ def stata_responder(df): td.skip_if_no("fastparquet"), td.skip_if_no("fsspec"), td.skip_array_manager_not_yet_implemented, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string"), ], ), (pickle_respnder, pd.read_pickle), From f3f17cb01a809e5f6299f4fa387c97adeac9126d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 5 Feb 2025 18:38:28 +0100 Subject: [PATCH 381/396] [backport 2.3.x] TST (string): from_dummies, dropna (#60818) (#60856) BUG(string): from_dummies, dropna (#60818) (cherry picked from commit ea7ff0ea4606f47a672f75793f4ea2b3eb0b87f5) Co-authored-by: jbrockmendel --- pandas/tests/frame/methods/test_dropna.py | 8 ++++---- pandas/tests/frame/test_arithmetic.py | 14 ++++++++++---- pandas/tests/reshape/test_from_dummies.py | 7 +++---- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py index 87a43b4e67c3f..0d4a6a065111f 100644 --- a/pandas/tests/frame/methods/test_dropna.py +++ b/pandas/tests/frame/methods/test_dropna.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -184,10 +182,12 @@ def test_dropna_multiple_axes(self): with pytest.raises(TypeError, match="supplying multiple axes"): inp.dropna(how="all", axis=(0, 1), inplace=True) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_dropna_tz_aware_datetime(self): + def test_dropna_tz_aware_datetime(self, using_infer_string): # GH13407 + df = DataFrame() + if using_infer_string: + df.columns = df.columns.astype("str") dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc()) dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc()) df["Time"] = [dt1] diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index eb85c108ca238..195126f1c5382 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,8 +11,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -2128,12 +2127,19 @@ def test_enum_column_equality(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_mixed_col_index_dtype(): +def test_mixed_col_index_dtype(using_infer_string): # GH 47382 df1 = DataFrame(columns=list("abc"), data=1.0, index=[0]) df2 = DataFrame(columns=list("abc"), data=0.0, index=[0]) df1.columns = df2.columns.astype("string") result = df1 + df2 expected = DataFrame(columns=list("abc"), data=1.0, index=[0]) + if using_infer_string: + # df2.columns.dtype will be "str" instead of object, + # so the aligned result will be "string", not object + if HAS_PYARROW: + dtype = "string[pyarrow]" + else: + dtype = "string" + expected.columns = 
expected.columns.astype(dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 6009b263a83c5..59c81c545697a 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Series, @@ -363,7 +361,6 @@ def test_with_prefix_contains_get_dummies_NaN_column(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "default_category, expected", [ @@ -400,11 +397,13 @@ def test_with_prefix_contains_get_dummies_NaN_column(): ], ) def test_with_prefix_default_category( - dummies_with_unassigned, default_category, expected + dummies_with_unassigned, default_category, expected, using_infer_string ): result = from_dummies( dummies_with_unassigned, sep="_", default_category=default_category ) + if using_infer_string: + expected = expected.astype("str") tm.assert_frame_equal(result, expected) From e3df72aba259d46cacd8c9bb15fed1ddc6baa30c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 7 Feb 2025 11:50:50 -0800 Subject: [PATCH 382/396] Backport PR #60873: TST/CI: xfail test_frame_setitem_dask_array_into_new_col for numpy>2.1 (#60876) --- pandas/tests/test_downstream.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 51ce73ef54300..d448773c3bd4a 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -23,6 +23,7 @@ DatetimeArray, TimedeltaArray, ) +from pandas.util.version import Version @pytest.fixture @@ -223,7 +224,7 @@ def test_missing_required_dependency(): assert name in output -def test_frame_setitem_dask_array_into_new_col(): +def test_frame_setitem_dask_array_into_new_col(request): # GH#47128 # dask sets "compute.use_numexpr" to False, so catch the current value @@ -231,7 +232,14 @@ def test_frame_setitem_dask_array_into_new_col(): olduse = pd.get_option("compute.use_numexpr") try: + dask = pytest.importorskip("dask") da = pytest.importorskip("dask.array") + if Version(dask.__version__) <= Version("2025.1.0") and Version( + np.__version__ + ) >= Version("2.1"): + request.applymarker( + pytest.mark.xfail(reason="loc.__setitem__ incorrectly mutated column c") + ) dda = da.array([1, 2]) df = DataFrame({"a": ["a", "b"]}) From e276e89a46bcd2d41da12b71d791df2224d339de Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 7 Feb 2025 15:25:26 -0800 Subject: [PATCH 383/396] Backport PR #60875 on branch 2.3.x (TST/CI: Address enforced numpy DeprecationWarning in test_pandas_dtype_numpy_warning) (#60878) Backport PR #60875: TST/CI: Address enforced numpy DeprecationWarning in test_pandas_dtype_numpy_warning Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/dtypes/common.py | 2 ++ pandas/tests/dtypes/test_common.py | 18 +++++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index fe705daaad5fa..873fb7676ed12 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1654,6 +1654,8 @@ def pandas_dtype(dtype) -> DtypeObj: # raise a consistent TypeError if 
failed try: with warnings.catch_warnings(): + # TODO: warnings.catch_warnings can be removed when numpy>2.2.2 + # is the minimum version # GH#51523 - Series.astype(np.integer) doesn't show # numpy deprecation warning of np.integer # Hence enabling DeprecationWarning diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index ceebfb1920594..8f87878b439f9 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -22,6 +22,7 @@ import pandas._testing as tm from pandas.api.types import pandas_dtype from pandas.arrays import SparseArray +from pandas.util.version import Version # EA & Actual Dtypes @@ -788,11 +789,18 @@ def test_validate_allhashable(): def test_pandas_dtype_numpy_warning(): # GH#51523 - with tm.assert_produces_warning( - DeprecationWarning, - check_stacklevel=False, - match="Converting `np.integer` or `np.signedinteger` to a dtype is deprecated", - ): + if Version(np.__version__) <= Version("2.2.2"): + ctx = tm.assert_produces_warning( + DeprecationWarning, + check_stacklevel=False, + match=( + "Converting `np.integer` or `np.signedinteger` to a dtype is deprecated" + ), + ) + else: + ctx = tm.external_error_raised(TypeError) + + with ctx: pandas_dtype(np.integer) From f1a4d76e335acd10d5e50cdbf55a27e7da73a003 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 7 Feb 2025 17:08:12 -0800 Subject: [PATCH 384/396] Backport PR #60847 on branch 2.3.x (TST/CI: skipif numba tests on Ubuntu ARM for numba 0.61) (#60880) Backport PR #60847: TST/CI: skipif numba tests on Ubuntu ARM for numba 0.61 --- pandas/tests/apply/test_frame_apply.py | 6 ++++++ pandas/tests/apply/test_numba.py | 12 +++++++++++- pandas/tests/frame/methods/test_info.py | 6 +++++- pandas/tests/groupby/aggregate/test_numba.py | 12 +++++++++++- pandas/tests/groupby/test_numba.py | 13 +++++++++++-- pandas/tests/groupby/transform/test_numba.py | 12 +++++++++++- pandas/tests/window/test_numba.py | 12 +++++++++++- pandas/tests/window/test_online.py | 13 +++++++++++-- 8 files changed, 77 insertions(+), 9 deletions(-) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index b7eac6b8f0ea1..1a776892b7bb7 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm + from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -16,6 +18,7 @@ ) import pandas._testing as tm from pandas.tests.frame.common import zip_frames +from pandas.util.version import Version @pytest.fixture @@ -65,6 +68,9 @@ def test_apply(float_frame, engine, request): @pytest.mark.parametrize("raw", [True, False]) def test_apply_args(float_frame, axis, raw, engine, request): if engine == "numba": + numba = pytest.importorskip("numba") + if Version(numba.__version__) == Version("0.61") and is_platform_arm(): + pytest.skip(f"Segfaults on ARM platforms with numba {numba.__version__}") mark = pytest.mark.xfail(reason="numba engine doesn't support args") request.node.add_marker(mark) result = float_frame.apply( diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 20c067a776f4d..c211073f75888 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm import pandas.util._test_decorators as td import pandas as pd @@ -9,8 
+10,17 @@ Index, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu] +pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu, pytest.mark.skipif()] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) @pytest.fixture(params=[0, 1]) diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index f0ae00fa6febb..c2d15e5ae88e8 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -13,6 +13,7 @@ HAS_PYARROW, IS64, PYPY, + is_platform_arm, ) from pandas import ( @@ -25,6 +26,7 @@ option_context, ) import pandas._testing as tm +from pandas.util.version import Version @pytest.fixture @@ -547,7 +549,9 @@ def test_memory_usage_empty_no_warning(using_infer_string): @pytest.mark.single_cpu def test_info_compute_numba(): # GH#51922 - pytest.importorskip("numba") + numba = pytest.importorskip("numba") + if Version(numba.__version__) == Version("0.61") and is_platform_arm(): + pytest.skip(f"Segfaults on ARM platforms with numba {numba.__version__}") df = DataFrame([[1, 2], [3, 4]]) with option_context("compute.use_numba", True): diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index ee694129f7118..fcd34f793c584 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm from pandas.errors import NumbaUtilError from pandas import ( @@ -11,8 +12,17 @@ option_context, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) def test_correct_function_signature(): diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py index ee7d342472493..f2c138c86a046 100644 --- a/pandas/tests/groupby/test_numba.py +++ b/pandas/tests/groupby/test_numba.py @@ -1,15 +1,24 @@ import pytest +from pandas.compat import is_platform_arm + from pandas import ( DataFrame, Series, option_context, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] -pytest.importorskip("numba") +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) @pytest.mark.filterwarnings("ignore") diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py index 61fcc930f116a..5afc6f3bdcd3c 100644 --- a/pandas/tests/groupby/transform/test_numba.py +++ b/pandas/tests/groupby/transform/test_numba.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm from pandas.errors import NumbaUtilError from pandas import ( @@ -9,8 +10,17 @@ option_context, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = 
pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) def test_correct_function_signature(): diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 139e1ff7f65fd..9ee7ed0c2f3e6 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm from pandas.errors import NumbaUtilError import pandas.util._test_decorators as td @@ -11,8 +12,17 @@ to_datetime, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] + +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) @pytest.fixture(params=["single", "table"]) diff --git a/pandas/tests/window/test_online.py b/pandas/tests/window/test_online.py index 14d3a39107bc4..43d55a7992b3c 100644 --- a/pandas/tests/window/test_online.py +++ b/pandas/tests/window/test_online.py @@ -1,15 +1,24 @@ import numpy as np import pytest +from pandas.compat import is_platform_arm + from pandas import ( DataFrame, Series, ) import pandas._testing as tm +from pandas.util.version import Version -pytestmark = pytest.mark.single_cpu +pytestmark = [pytest.mark.single_cpu] -pytest.importorskip("numba") +numba = pytest.importorskip("numba") +pytestmark.append( + pytest.mark.skipif( + Version(numba.__version__) == Version("0.61") and is_platform_arm(), + reason=f"Segfaults on ARM platforms with numba {numba.__version__}", + ) +) @pytest.mark.filterwarnings("ignore") From 36ae10d8f989eaa511ab1ffeee657288da75ebbf Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 10 Feb 2025 17:28:24 -0500 Subject: [PATCH 385/396] =?UTF-8?q?ENH:=20Improved=20error=20message=20and?= =?UTF-8?q?=20raise=20new=20error=20for=20small-string=20NaN=20=E2=80=A6?= =?UTF-8?q?=20(#60907)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ENH: Improved error message and raise new error for small-string NaN edge case in HDFStore.append (#60829) * Add clearer error messages for datatype mismatch in HDFStore.append. Raise ValueError when nan_rep too large for pytable column. Add and modify applicable test code. * Fix missed tests and correct mistake in error message. * Remove excess comments. Reverse error type change to avoid api changes. Move nan_rep tests into separate function. 
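A sketch of the new constraint, mirroring the added test (the store path is illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({"A": "a", "B": "foo"}, index=np.arange(10))
df_nan = df.copy()
df_nan.loc[0:4, :] = np.nan

with pd.HDFStore("store.h5") as store:  # illustrative path
    # "A" holds single characters, so the column itemsize is 1 and the
    # default nan_rep "nan" (3 characters) cannot fit:
    store.append("sa", df["A"])
    # store.append("sa", df_nan["A"])  # ValueError: NaN representation is
    #                                  # too large for existing column size

    # a one-character nan_rep fits the existing column:
    store.append("sc", df["A"], nan_rep="n")
    store.append("sc", df_nan["A"])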
(cherry picked from commit 57340ecd08580f26ee4a976c1f68b2f563c41569) Co-authored-by: Jake Thomas Trevallion <136272202+JakeTT404@users.noreply.github.com> --- pandas/io/pytables.py | 9 ++++++ pandas/tests/io/pytables/test_append.py | 35 +++++++++++++++++---- pandas/tests/io/pytables/test_round_trip.py | 9 ++---- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index dbe2db9f9625b..6d5202c58a0d5 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3464,6 +3464,12 @@ def validate(self, other) -> None: # Value of type "Optional[Any]" is not indexable [index] oax = ov[i] # type: ignore[index] if sax != oax: + if c == "values_axes" and sax.kind != oax.kind: + raise ValueError( + f"Cannot serialize the column [{oax.values[0]}] " + f"because its data contents are not [{sax.kind}] " + f"but [{oax.kind}] object dtype" + ) raise ValueError( f"invalid combination of [{c}] on appending data " f"[{sax}] vs current table [{oax}]" @@ -5111,6 +5117,9 @@ def _maybe_convert_for_string_atom( data = bvalues.copy() data[mask] = nan_rep + if existing_col and mask.any() and len(nan_rep) > existing_col.itemsize: + raise ValueError("NaN representation is too large for existing column size") + # see if we have a valid string type inferred_type = lib.infer_dtype(data, skipna=False) if inferred_type != "string": diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 93e50455fe6a2..fd2deacb69b3c 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -818,12 +818,9 @@ def test_append_raise(setup_path): store.append("df", df) df["foo"] = "bar" msg = re.escape( - "invalid combination of [values_axes] on appending data " - "[name->values_block_1,cname->values_block_1," - "dtype->bytes24,kind->string,shape->(1, 30)] " - "vs current table " - "[name->values_block_1,cname->values_block_1," - "dtype->datetime64[s],kind->datetime64[s],shape->None]" + "Cannot serialize the column [foo] " + "because its data contents are not [string] " + "but [datetime64[s]] object dtype" ) with pytest.raises(ValueError, match=msg): store.append("df", df) @@ -989,3 +986,29 @@ def test_append_to_multiple_min_itemsize(setup_path): ) result = store.select_as_multiple(["index", "nums", "strs"]) tm.assert_frame_equal(result, expected, check_index_type=True) + + +def test_append_string_nan_rep(setup_path): + # GH 16300 + df = DataFrame({"A": "a", "B": "foo"}, index=np.arange(10)) + df_nan = df.copy() + df_nan.loc[0:4, :] = np.nan + msg = "NaN representation is too large for existing column size" + + with ensure_clean_store(setup_path) as store: + # string column too small + store.append("sa", df["A"]) + with pytest.raises(ValueError, match=msg): + store.append("sa", df_nan["A"]) + + # nan_rep too big + store.append("sb", df["B"], nan_rep="bars") + with pytest.raises(ValueError, match=msg): + store.append("sb", df_nan["B"]) + + # smaller modified nan_rep + store.append("sc", df["A"], nan_rep="n") + store.append("sc", df_nan["A"]) + result = store["sc"] + expected = concat([df["A"], df_nan["A"]]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 2397d18b1019e..72d90b1273d65 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -213,12 +213,9 @@ def test_table_values_dtypes_roundtrip(setup_path): # incompatible dtype msg = re.escape( 
- "invalid combination of [values_axes] on appending data " - "[name->values_block_0,cname->values_block_0," - "dtype->float64,kind->float,shape->(1, 3)] vs " - "current table [name->values_block_0," - "cname->values_block_0,dtype->int64,kind->integer," - "shape->None]" + "Cannot serialize the column [a] " + "because its data contents are not [float] " + "but [integer] object dtype" ) with pytest.raises(ValueError, match=msg): store.append("df_i8", df1) From 79e203e00f7f328e136f846f2eef03cdc7a06a1e Mon Sep 17 00:00:00 2001 From: SALCAN <68040183+sanggon6107@users.noreply.github.com> Date: Wed, 12 Feb 2025 11:28:08 +0900 Subject: [PATCH 386/396] BUG: Prevent pd.Series.groupby from showing FutureWarning (#60894) Call is_in_obj only when obj is a DataFrame at get_grouper() --- pandas/core/groupby/grouper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e2224caad9e84..4bf2e8b90a0b0 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -1023,7 +1023,7 @@ def is_in_obj(gpr) -> bool: return False for gpr, level in zip(keys, levels): - if is_in_obj(gpr): # df.groupby(df['name']) + if isinstance(obj, DataFrame) and is_in_obj(gpr): # df.groupby(df['name']) in_axis = True exclusions.add(gpr.name) From 12cd45e9e4cc8518175832eaaa450e52f391ce69 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 13 Feb 2025 17:54:27 -0800 Subject: [PATCH 387/396] Backport PR #60929 on branch 2.3.x (TST: Update numpy version check for test_pandas_dtype_numpy_warning) (#60930) Backport PR #60929: TST: Update numpy version check for test_pandas_dtype_numpy_warning Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/dtypes/common.py | 2 +- pandas/tests/dtypes/test_common.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 873fb7676ed12..6dea15ac0bc24 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1654,7 +1654,7 @@ def pandas_dtype(dtype) -> DtypeObj: # raise a consistent TypeError if failed try: with warnings.catch_warnings(): - # TODO: warnings.catch_warnings can be removed when numpy>2.2.2 + # TODO: warnings.catch_warnings can be removed when numpy>2.3.0 # is the minimum version # GH#51523 - Series.astype(np.integer) doesn't show # numpy deprecation warning of np.integer diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 8f87878b439f9..579f5636922dc 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -789,7 +789,7 @@ def test_validate_allhashable(): def test_pandas_dtype_numpy_warning(): # GH#51523 - if Version(np.__version__) <= Version("2.2.2"): + if Version(np.__version__) < Version("2.3.0.dev0"): ctx = tm.assert_produces_warning( DeprecationWarning, check_stacklevel=False, From 3143f441e2a582758a5373a44bd0e67dd6d57d9b Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sun, 16 Feb 2025 12:39:03 -0500 Subject: [PATCH 388/396] Backport PR #60795: TST(string dtype): Resolve xfails in pytables (#60916) * ENH: Improved error message and raise new error for small-string NaN edge case in HDFStore.append (#60829) * Add clearer error messages for datatype mismatch in HDFStore.append. Raise ValueError when nan_rep too large for pytable column. 
Add and modify applicable test code. * Fix missed tests and correct mistake in error message. * Remove excess comments. Reverse error type change to avoid api changes. Move nan_rep tests into separate function. (cherry picked from commit 57340ecd08580f26ee4a976c1f68b2f563c41569) * TST(string dtype): Resolve xfails in pytables (#60795) (cherry picked from commit 4511251ccf409f2ba71cab0283bdf751697ee539) * Adjust test --------- Co-authored-by: Jake Thomas Trevallion <136272202+JakeTT404@users.noreply.github.com> --- pandas/io/pytables.py | 3 + pandas/tests/io/pytables/test_append.py | 56 ++++++++------- pandas/tests/io/pytables/test_categorical.py | 6 +- pandas/tests/io/pytables/test_complex.py | 6 -- pandas/tests/io/pytables/test_errors.py | 18 ++--- .../tests/io/pytables/test_file_handling.py | 10 +-- pandas/tests/io/pytables/test_keys.py | 7 +- pandas/tests/io/pytables/test_put.py | 4 +- pandas/tests/io/pytables/test_read.py | 16 +++-- pandas/tests/io/pytables/test_round_trip.py | 49 +++++++------ pandas/tests/io/pytables/test_select.py | 44 ++++++------ pandas/tests/io/pytables/test_store.py | 71 ++++++++++--------- pandas/tests/io/pytables/test_timezones.py | 6 -- 13 files changed, 145 insertions(+), 151 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6d5202c58a0d5..d93a3f26934a0 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -5093,6 +5093,9 @@ def _maybe_convert_for_string_atom( errors, columns: list[str], ): + if isinstance(bvalues.dtype, StringDtype): + # "ndarray[Any, Any]" has no attribute "to_numpy" + bvalues = bvalues.to_numpy() # type: ignore[union-attr] if bvalues.dtype != object: return bvalues diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index fd2deacb69b3c..39c203c558a5b 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -25,10 +25,7 @@ ensure_clean_store, ) -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] tables = pytest.importorskip("tables") @@ -40,7 +37,7 @@ def test_append(setup_path): # tables.NaturalNameWarning): df = DataFrame( np.random.default_rng(2).standard_normal((20, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=20, freq="B"), ) _maybe_remove(store, "df1") @@ -201,7 +198,7 @@ def test_append_some_nans(setup_path): tm.assert_frame_equal(store["df3"], df3, check_index_type=True) -def test_append_all_nans(setup_path): +def test_append_all_nans(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: df = DataFrame( { @@ -253,7 +250,13 @@ def test_append_all_nans(setup_path): _maybe_remove(store, "df") store.append("df", df[:10], dropna=True) store.append("df", df[10:], dropna=True) - tm.assert_frame_equal(store["df"], df, check_index_type=True) + result = store["df"] + expected = df + if using_infer_string: + # TODO: Test is incorrect when not using_infer_string. + # Should take the last 4 rows uncondiationally. 
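For reference, the `dropna=True` semantics this test exercises, sketched with float-only columns and an illustrative path:

import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
df = pd.DataFrame({"A1": rng.standard_normal(20), "A2": rng.standard_normal(20)})
df.loc[0:15, :] = np.nan  # rows 0..15 become entirely NaN

with pd.HDFStore("dropna.h5") as store:  # illustrative path
    store.append("df", df[:10], dropna=True)  # every row all-NaN: none written
    store.append("df", df[10:], dropna=True)  # only rows 16..19 survive
    result = store["df"]

assert len(result) == 4  # the last four rows, the ones that hold data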
+ expected = expected[-4:] + tm.assert_frame_equal(result, expected, check_index_type=True) _maybe_remove(store, "df2") store.append("df2", df[:10], dropna=False) @@ -292,7 +295,7 @@ def test_append_frame_column_oriented(setup_path): # column oriented df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df.index = df.index._with_freq(None) # freq doesn't round-trip @@ -417,7 +420,7 @@ def check_col(key, name, size): { "A": [0.0, 1.0, 2.0, 3.0, 4.0], "B": [0.0, 1.0, 0.0, 1.0, 0.0], - "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), + "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]), "D": date_range("20130101", periods=5), } ).set_index("C") @@ -444,7 +447,7 @@ def check_col(key, name, size): _maybe_remove(store, "df") df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["string"] = "foo" @@ -504,11 +507,12 @@ def test_append_with_empty_string(setup_path): tm.assert_frame_equal(store.select("df"), df) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_append_with_data_columns(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df.iloc[0, df.columns.get_loc("B")] = 1.0 @@ -684,8 +688,8 @@ def test_append_misc(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) store.append("df", df, chunksize=1) result = store.select("df") @@ -701,8 +705,8 @@ def test_append_misc_chunksize(setup_path, chunksize): # more chunksize in append tests df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df["string"] = "foo" df["float322"] = 1.0 @@ -742,15 +746,15 @@ def test_append_misc_empty_frame(setup_path): # the conversion from AM->BM converts the invalid object dtype column into # a datetime64 column no longer raising an error @td.skip_array_manager_not_yet_implemented -def test_append_raise(setup_path): +def test_append_raise(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: # test append with invalid input to get good error messages # list in column df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df["invalid"] = [["a"]] * len(df) assert df.dtypes["invalid"] == np.object_ @@ -770,8 +774,8 @@ def test_append_raise(setup_path): # datetime with embedded nans as object df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) s = 
Series(datetime.datetime(2001, 1, 2), index=df.index) s = s.astype(object) @@ -798,8 +802,8 @@ def test_append_raise(setup_path): # appending an incompatible table df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) store.append("df", df) @@ -876,7 +880,7 @@ def test_append_with_timedelta(setup_path): def test_append_to_multiple(setup_path): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) @@ -913,12 +917,12 @@ def test_append_to_multiple(setup_path): def test_append_to_multiple_dropna(setup_path): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ).rename(columns="{}_2".format) df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan @@ -938,7 +942,7 @@ def test_append_to_multiple_dropna(setup_path): def test_append_to_multiple_dropna_false(setup_path): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py index 07c797467e5e2..a875e19ea7f0e 100644 --- a/pandas/tests/io/pytables/test_categorical.py +++ b/pandas/tests/io/pytables/test_categorical.py @@ -16,10 +16,7 @@ ensure_clean_store, ) -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] def test_categorical(setup_path): @@ -143,6 +140,7 @@ def test_categorical(setup_path): store.select("df3/meta/s/meta") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_categorical_conversion(tmp_path, setup_path): # GH13322 # Check that read_hdf with categorical columns doesn't return rows if diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index d140cfc941e16..c5cac5a5caf09 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -13,10 +11,6 @@ from pandas.io.pytables import read_hdf -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - def test_complex_fixed(tmp_path, setup_path): df = DataFrame( diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py index c31b9989ef35e..b28101c09820f 100644 --- a/pandas/tests/io/pytables/test_errors.py +++ b/pandas/tests/io/pytables/test_errors.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( CategoricalIndex, DataFrame, @@ -24,10 +22,7 @@ 
_maybe_adjust_name, ) -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] def test_pass_spec_to_storer(setup_path): @@ -93,9 +88,14 @@ def test_unimplemented_dtypes_table_columns(setup_path): with ensure_clean_store(setup_path) as store: # this fails because we have a date in the object block...... - msg = re.escape( - """Cannot serialize the column [datetime1] -because its data contents are not [string] but [date] object dtype""" + msg = "|".join( + [ + re.escape( + "Cannot serialize the column [datetime1]\nbecause its data " + "contents are not [string] but [date] object dtype" + ), + re.escape("[date] is not implemented as a table column"), + ] ) with pytest.raises(TypeError, match=msg): store.append("df_unimplemented", df) diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index 1878f2a392e13..100a55e6e346d 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( PY311, is_ci_environment, @@ -34,9 +32,7 @@ from pandas.io import pytables from pandas.io.pytables import Term -pytestmark = [ - pytest.mark.single_cpu, -] +pytestmark = [pytest.mark.single_cpu] @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"]) @@ -323,7 +319,6 @@ def test_complibs(tmp_path, lvl, lib, request): assert node.filters.complib == lib -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.skipif( not is_platform_little_endian(), reason="reason platform is not little endian" ) @@ -341,7 +336,6 @@ def test_encoding(setup_path): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "val", [ @@ -356,7 +350,7 @@ def test_encoding(setup_path): [b"A\xf8\xfc", np.nan, b"", b"b", b"c"], ], ) -@pytest.mark.parametrize("dtype", ["category", object]) +@pytest.mark.parametrize("dtype", ["category", None]) def test_latin_encoding(tmp_path, setup_path, dtype, val): enc = "latin-1" nan_rep = "" diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py index 7d0802dcf2e47..9c5fc8786c7c6 100644 --- a/pandas/tests/io/pytables/test_keys.py +++ b/pandas/tests/io/pytables/test_keys.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, HDFStore, @@ -15,10 +13,7 @@ tables, ) -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] def test_keys(setup_path): diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index 38f0379eb9a66..36ca68eb227a6 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -22,9 +22,7 @@ ) from pandas.util import _test_decorators as td -pytestmark = [ - pytest.mark.single_cpu, -] +pytestmark = [pytest.mark.single_cpu] def test_format_type(tmp_path, setup_path): diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index 28cd8aea1defc..bfebf18c0e0ab 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -28,10 +28,7 @@ from pandas.io.pytables 
import TableIterator -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] def test_read_missing_key_close_store(tmp_path, setup_path): @@ -77,10 +74,11 @@ def test_read_missing_key_opened_store(tmp_path, setup_path): read_hdf(store, "k1") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_read_column(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -221,7 +219,7 @@ def test_legacy_table_read_py2(datapath): tm.assert_frame_equal(expected, result) -def test_read_hdf_open_store(tmp_path, setup_path): +def test_read_hdf_open_store(tmp_path, setup_path, using_infer_string): # GH10330 # No check for non-string path_or-buf, and no test of open store df = DataFrame( @@ -233,6 +231,12 @@ def test_read_hdf_open_store(tmp_path, setup_path): df = df.set_index(keys="E", append=True) path = tmp_path / setup_path + if using_infer_string: + # TODO(infer_string) make this work for string dtype + msg = "Saving a MultiIndex with an extension dtype is not supported." + with pytest.raises(NotImplementedError, match=msg): + df.to_hdf(path, key="df", mode="w") + return df.to_hdf(path, key="df", mode="w") direct = read_hdf(path, "df") with HDFStore(path, mode="r") as store: diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 72d90b1273d65..040708c9cedd0 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp from pandas.compat import is_platform_windows @@ -26,10 +24,7 @@ ) from pandas.util import _test_decorators as td -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] def test_conv_read_write(): @@ -49,8 +44,8 @@ def roundtrip(key, obj, **kwargs): o = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) tm.assert_frame_equal(o, roundtrip("frame", o)) @@ -150,8 +145,8 @@ def test_api_invalid(tmp_path, setup_path): # Invalid. 
df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) msg = "Can only append to Tables" @@ -201,7 +196,7 @@ def test_put_integer(setup_path): _check_roundtrip(df, tm.assert_frame_equal, setup_path) -def test_table_values_dtypes_roundtrip(setup_path): +def test_table_values_dtypes_roundtrip(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8") store.append("df_f8", df1) @@ -244,6 +239,7 @@ def test_table_values_dtypes_roundtrip(setup_path): store.append("df_mixed_dtypes1", df1) result = store.select("df_mixed_dtypes1").dtypes.value_counts() result.index = [str(i) for i in result.index] + str_dtype = "str" if using_infer_string else "object" expected = Series( { "float32": 2, @@ -253,7 +249,7 @@ def test_table_values_dtypes_roundtrip(setup_path): "int16": 1, "int8": 1, "int64": 1, - "object": 1, + str_dtype: 1, "datetime64[ns]": 2, }, name="count", @@ -273,10 +269,10 @@ def test_series(setup_path): ) _check_roundtrip(ts, tm.assert_series_equal, path=setup_path) - ts2 = Series(ts.index, Index(ts.index, dtype=object)) + ts2 = Series(ts.index, Index(ts.index)) _check_roundtrip(ts2, tm.assert_series_equal, path=setup_path) - ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) + ts3 = Series(ts.values, Index(np.asarray(ts.index))) _check_roundtrip( ts3, tm.assert_series_equal, path=setup_path, check_index_type=False ) @@ -366,8 +362,8 @@ def test_timeseries_preepoch(setup_path, request): def test_frame(compression, setup_path): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) # put in some random NAs @@ -383,7 +379,7 @@ def test_frame(compression, setup_path): tdf = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) _check_roundtrip( @@ -398,7 +394,10 @@ def test_frame(compression, setup_path): assert recons._mgr.is_consolidated() # empty - _check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path) + df2 = df[:0] + # Prevent df2 from having index with inferred_type as string + df2.index = Index([]) + _check_roundtrip(df2[:0], tm.assert_frame_equal, path=setup_path) def test_empty_series_frame(setup_path): @@ -430,9 +429,17 @@ def test_can_serialize_dates(setup_path): _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) -def test_store_hierarchical(setup_path, multiindex_dataframe_random_data): +def test_store_hierarchical( + setup_path, using_infer_string, multiindex_dataframe_random_data +): frame = multiindex_dataframe_random_data + if using_infer_string: + # TODO(infer_string) make this work for string dtype + msg = "Saving a MultiIndex with an extension dtype is not supported." 
+ with pytest.raises(NotImplementedError, match=msg): + _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) + return _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) _check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path) _check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path) @@ -451,8 +458,8 @@ def test_store_mixed(compression, setup_path): def _make_one(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df["obj1"] = "foo" df["obj2"] = "bar" diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index 9f403f8293aed..f781b6756fec9 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -26,10 +26,7 @@ from pandas.io.pytables import Term -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] def test_select_columns_in_where(setup_path): @@ -137,7 +134,7 @@ def test_select(setup_path): # select with columns= df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) _maybe_remove(store, "df") @@ -277,8 +274,8 @@ def test_select_dtypes(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) expected = df[df["A"] > 0] @@ -342,7 +339,7 @@ def test_select_iterator(tmp_path, setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) _maybe_remove(store, "df") @@ -367,7 +364,7 @@ def test_select_iterator(tmp_path, setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df.to_hdf(path, key="df_non_table") @@ -383,7 +380,7 @@ def test_select_iterator(tmp_path, setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df.to_hdf(path, key="df", format="table") @@ -400,7 +397,7 @@ def test_select_iterator(tmp_path, setup_path): with ensure_clean_store(setup_path) as store: df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) store.append("df1", df1, data_columns=True) @@ -428,7 +425,7 @@ def test_select_iterator_complete_8014(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -463,7 +460,7 @@ def 
test_select_iterator_complete_8014(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -505,7 +502,7 @@ def test_select_iterator_non_complete_8014(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -539,7 +536,7 @@ def test_select_iterator_non_complete_8014(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -563,7 +560,7 @@ def test_select_iterator_many_empty_frames(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -615,7 +612,7 @@ def test_select_iterator_many_empty_frames(setup_path): def test_frame_select(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -640,7 +637,7 @@ def test_frame_select(setup_path): # invalid terms df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) store.append("df_time", df) @@ -654,12 +651,13 @@ def test_frame_select(setup_path): # store.select('frame', [crit1, crit2]) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_select_complex(setup_path): # select via complex criteria df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["string"] = "foo" @@ -776,7 +774,7 @@ def test_invalid_filtering(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -798,7 +796,7 @@ def test_string_select(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -842,7 +840,7 @@ def test_string_select(setup_path): def test_select_as_multiple(setup_path): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) @@ -967,6 +965,7 @@ def test_query_long_float_literal(setup_path): tm.assert_frame_equal(expected, result) +@pytest.mark.xfail(using_string_dtype(), 
reason="TODO(infer_string)") def test_query_compare_column_type(setup_path): # GH 15492 df = DataFrame( @@ -1043,7 +1042,6 @@ def test_select_large_integer(tmp_path): ), columns=["x", "y"], ) - result = None with HDFStore(path) as s: s.append("data", df, data_columns=True, index=False) result = s.select("data", where="y==-9223372036854775801").get("y").get(0) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 8a33cccf62fcf..f51d61e2d633c 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -9,6 +9,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + import pandas as pd from pandas import ( DataFrame, @@ -33,10 +35,7 @@ read_hdf, ) -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] tables = pytest.importorskip("tables") @@ -108,7 +107,7 @@ def test_iter_empty(setup_path): assert list(store) == [] -def test_repr(setup_path): +def test_repr(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: repr(store) store.info() @@ -143,7 +142,9 @@ def test_repr(setup_path): df.loc[df.index[3:6], ["obj1"]] = np.nan df = df._consolidate() - with tm.assert_produces_warning(pd.errors.PerformanceWarning): + warning = None if using_infer_string else pd.errors.PerformanceWarning + msg = "cannot\nmap directly to c-types .* dtype='object'" + with tm.assert_produces_warning(warning, match=msg): store["df"] = df # make a random group in hdf space @@ -314,7 +315,7 @@ def test_getattr(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) store["df"] = df @@ -381,7 +382,7 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path): { "A": [0.0, 1.0, 2.0, 3.0, 4.0], "B": [0.0, 1.0, 0.0, 1.0, 0.0], - "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), + "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]), "D": date_range("20130101", periods=5), } ).set_index("C") @@ -397,6 +398,10 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path): tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]])) +@pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, + reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed", +) @pytest.mark.parametrize("format", ["fixed", "table"]) def test_to_hdf_errors(tmp_path, format, setup_path): data = ["\ud800foo"] @@ -418,7 +423,7 @@ def col(t, column): # data columns df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["string"] = "foo" @@ -453,7 +458,7 @@ def col(t, column): # data columns df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["string"] = "foo" @@ -495,8 +500,8 @@ def test_table_mixed_dtypes(setup_path): # frame df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df["obj1"] = "foo" df["obj2"] = "bar" @@ -551,8 +556,8 
@@ def test_remove(setup_path): ) df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) store["a"] = ts store["b"] = df @@ -615,8 +620,8 @@ def test_same_name_scoping(setup_path): def test_store_index_name(setup_path): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df.index.name = "foo" @@ -658,8 +663,8 @@ def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz def test_store_series_name(setup_path): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) series = df["A"] @@ -673,7 +678,7 @@ def test_overwrite_node(setup_path): with ensure_clean_store(setup_path) as store: store["a"] = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) ts = Series( @@ -687,7 +692,7 @@ def test_overwrite_node(setup_path): def test_coordinates(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -722,7 +727,7 @@ def test_coordinates(setup_path): _maybe_remove(store, "df2") df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) @@ -878,8 +883,8 @@ def test_start_stop_fixed(setup_path): # sparse; not implemented df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df.iloc[3:5, 1:3] = np.nan df.iloc[8:10, -2] = np.nan @@ -905,8 +910,8 @@ def test_select_filter_corner(setup_path): def test_path_pathlib(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_pathlib( @@ -935,8 +940,8 @@ def test_contiguous_mixed_data_table(start, stop, setup_path): def test_path_pathlib_hdfstore(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) def writer(path): @@ -954,8 +959,8 @@ def reader(path): def test_pickle_path_localpath(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_pathlib( lambda p: df.to_hdf(p, key="df"), lambda p: read_hdf(p, "df") @@ -966,8 +971,8 @@ def 
test_pickle_path_localpath(): def test_path_localpath_hdfstore(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) def writer(path): @@ -986,8 +991,8 @@ def reader(path): def test_copy(propindexes): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 05d630dc0e47c..c5613daf62207 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs.timezones import maybe_get_tz import pandas.util._test_decorators as td @@ -25,10 +23,6 @@ ensure_clean_store, ) -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - def _compare_with_tz(a, b): tm.assert_frame_equal(a, b) From b8624cb1f426b1e6e6aa12f798e014809658adc1 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 20 Feb 2025 12:30:58 -0500 Subject: [PATCH 389/396] Backport PR #60943: BUG(string dtype): Resolve pytables xfail when reading with condition (#60967) * ENH: Improved error message and raise new error for small-string NaN edge case in HDFStore.append (#60829) * Add clearer error messages for datatype mismatch in HDFStore.append. Raise ValueError when nan_rep too large for pytable column. Add and modify applicable test code. * Fix missed tests and correct mistake in error message. * Remove excess comments. Reverse error type change to avoid api changes. Move nan_rep tests into separate function. 
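For context, a minimal sketch of the round-trip this series targets (illustrative only, not part of the patch; "demo.h5" is a hypothetical path, and PyTables plus the ``future.infer_string`` option are assumed):

    import pandas as pd

    pd.set_option("future.infer_string", True)
    df = pd.DataFrame({"A": ["x", "y", "z"], "B": [1.0, 2.0, 3.0]})
    with pd.HDFStore("demo.h5", mode="w") as store:
        store.append("df", df, data_columns=True)
        # Reading a single column back should keep the string dtype
        # rather than falling back to object.
        col = store.select_column("df", "A")
        # Selecting with a condition on a string column is the case
        # that previously xfailed under the new string dtype.
        result = store.select("df", where="A == 'x'")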
(cherry picked from commit 57340ecd08580f26ee4a976c1f68b2f563c41569) * TST(string dtype): Resolve xfails in pytables (#60795) (cherry picked from commit 4511251ccf409f2ba71cab0283bdf751697ee539) * BUG(string dtype): Resolve pytables xfail when reading with condition (#60943) (cherry picked from commit 0ec5f2668e9568d90595180d5ee925305ec7182e) --------- Co-authored-by: Jake Thomas Trevallion <136272202+JakeTT404@users.noreply.github.com> --- pandas/io/pytables.py | 18 ++++++++++++++++-- pandas/tests/io/pytables/test_append.py | 3 --- pandas/tests/io/pytables/test_categorical.py | 3 --- pandas/tests/io/pytables/test_read.py | 3 --- pandas/tests/io/pytables/test_select.py | 4 ---- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d93a3f26934a0..65f95dab7b42f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4093,6 +4093,8 @@ def _create_axes( ordered = data_converted.ordered meta = "category" metadata = np.asarray(data_converted.categories).ravel() + elif isinstance(blk.dtype, StringDtype): + meta = str(blk.dtype) data, dtype_name = _get_data_and_dtype_name(data_converted) @@ -4360,7 +4362,9 @@ def read_column( encoding=self.encoding, errors=self.errors, ) - return Series(_set_tz(col_values[1], a.tz), name=column, copy=False) + cvs = _set_tz(col_values[1], a.tz) + dtype = getattr(self.table.attrs, f"{column}_meta", None) + return Series(cvs, name=column, copy=False, dtype=dtype) raise KeyError(f"column [{column}] not found in the table") @@ -4708,8 +4712,18 @@ def read( df = DataFrame._from_arrays([values], columns=cols_, index=index_) if not (using_string_dtype() and values.dtype.kind == "O"): assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) + + # If str / string dtype is stored in meta, use that. + converted = False + for column in cols_: + dtype = getattr(self.table.attrs, f"{column}_meta", None) + if dtype in ["str", "string"]: + df[column] = df[column].astype(dtype) + converted = True + # Otherwise try inference. 
if ( - using_string_dtype() + not converted + and using_string_dtype() and isinstance(values, np.ndarray) and is_string_array( values, diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 39c203c558a5b..d0246c8f58d6a 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp import pandas.util._test_decorators as td @@ -507,7 +505,6 @@ def test_append_with_empty_string(setup_path): tm.assert_frame_equal(store.select("df"), df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_append_with_data_columns(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py index a875e19ea7f0e..449bc5cf1fc57 100644 --- a/pandas/tests/io/pytables/test_categorical.py +++ b/pandas/tests/io/pytables/test_categorical.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( Categorical, DataFrame, @@ -140,7 +138,6 @@ def test_categorical(setup_path): store.select("df3/meta/s/meta") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_categorical_conversion(tmp_path, setup_path): # GH13322 # Check that read_hdf with categorical columns doesn't return rows if diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index bfebf18c0e0ab..5bec673ad3c70 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp from pandas.compat import is_platform_windows @@ -74,7 +72,6 @@ def test_read_missing_key_opened_store(tmp_path, setup_path): read_hdf(store, "k1") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_read_column(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index f781b6756fec9..e76934745f004 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp import pandas as pd @@ -651,7 +649,6 @@ def test_frame_select(setup_path): # store.select('frame', [crit1, crit2]) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_select_complex(setup_path): # select via complex criteria @@ -965,7 +962,6 @@ def test_query_long_float_literal(setup_path): tm.assert_frame_equal(expected, result) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_query_compare_column_type(setup_path): # GH 15492 df = DataFrame( From 81229e610da43e7de0bf6a63134608205cc517f5 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 20 Feb 2025 12:32:04 -0500 Subject: [PATCH 390/396] Backport PR #60940: ENH: Add dtype argument to str.decode (#60968) * ENH: Improved error message and raise new error for small-string NaN edge case in HDFStore.append (#60829) * Add clearer error messages for datatype mismatch in HDFStore.append. 
Raise ValueError when nan_rep too large for pytable column. Add and modify applicable test code. * Fix missed tests and correct mistake in error message. * Remove excess comments. Reverse error type change to avoid api changes. Move nan_rep tests into separate function. (cherry picked from commit 57340ecd08580f26ee4a976c1f68b2f563c41569) * TST(string dtype): Resolve xfails in pytables (#60795) (cherry picked from commit 4511251ccf409f2ba71cab0283bdf751697ee539) * BUG(string dtype): Resolve pytables xfail when reading with condition (#60943) (cherry picked from commit 0ec5f2668e9568d90595180d5ee925305ec7182e) * Backport PR #60940: ENH: Add dtype argument to str.decode --------- Co-authored-by: Jake Thomas Trevallion <136272202+JakeTT404@users.noreply.github.com> --- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/core/strings/accessor.py | 18 ++++++++++++++++-- pandas/tests/strings/test_strings.py | 24 ++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 0b7c2bac1be6a..c4e01a86ce843 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -37,6 +37,7 @@ Other enhancements updated to raise FutureWarning with NumPy >= 2 (:issue:`60340`) - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) +- The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`) - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 3c4cf60ab262a..c0e458f7968e7 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -33,6 +33,7 @@ is_list_like, is_object_dtype, is_re, + is_string_dtype, ) from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -1981,7 +1982,9 @@ def slice_replace(self, start=None, stop=None, repl=None): result = self._data.array._str_slice_replace(start, stop, repl) return self._wrap_result(result) - def decode(self, encoding, errors: str = "strict"): + def decode( + self, encoding, errors: str = "strict", dtype: str | DtypeObj | None = None + ): """ Decode character string in the Series/Index using indicated encoding. @@ -1992,6 +1995,14 @@ def decode(self, encoding, errors: str = "strict"): ---------- encoding : str errors : str, optional + Specifies the error handling scheme. + Possible values are those supported by :meth:`bytes.decode`. + dtype : str or dtype, optional + The dtype of the result. When not ``None``, must be either a string or + object dtype. When ``None``, the dtype of the result is determined by + ``pd.options.future.infer_string``. + + .. versionadded:: 2.3.0 Returns ------- @@ -2008,6 +2019,10 @@ def decode(self, encoding, errors: str = "strict"): 2 () dtype: object """ + if dtype is not None and not is_string_dtype(dtype): + raise ValueError(f"dtype must be string or object, got {dtype=}") + if dtype is None and get_option("future.infer_string"): + dtype = "str" # TODO: Add a similar _bytes interface. 
if encoding in _cpython_optimized_decoders: # CPython optimized implementation @@ -2017,7 +2032,6 @@ def decode(self, encoding, errors: str = "strict"): f = lambda x: decoder(x, errors)[0] arr = self._data.array result = arr._str_map(f) - dtype = "str" if get_option("future.infer_string") else None return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 59a06a421f53e..c729b910d05a7 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -599,6 +599,30 @@ def test_decode_errors_kwarg(): tm.assert_series_equal(result, expected) +def test_decode_string_dtype(string_dtype): + # https://github.com/pandas-dev/pandas/pull/60940 + ser = Series([b"a", b"b"]) + result = ser.str.decode("utf-8", dtype=string_dtype) + expected = Series(["a", "b"], dtype=string_dtype) + tm.assert_series_equal(result, expected) + + +def test_decode_object_dtype(object_dtype): + # https://github.com/pandas-dev/pandas/pull/60940 + ser = Series([b"a", rb"\ud800"]) + result = ser.str.decode("utf-8", dtype=object_dtype) + expected = Series(["a", r"\ud800"], dtype=object_dtype) + tm.assert_series_equal(result, expected) + + +def test_decode_bad_dtype(): + # https://github.com/pandas-dev/pandas/pull/60940 + ser = Series([b"a", b"b"]) + msg = "dtype must be string or object, got dtype='int64'" + with pytest.raises(ValueError, match=msg): + ser.str.decode("utf-8", dtype="int64") + + @pytest.mark.parametrize( "form, expected", [ From 684a1a3056a9aac89b7438fe41fcd81d1a1b2158 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 22 Feb 2025 13:51:16 -0500 Subject: [PATCH 391/396] Backport PR #60938: ENH(string dtype): Implement cumsum for Python-backed strings (#60984) * ENH: Improved error message and raise new error for small-string NaN edge case in HDFStore.append (#60829) * Add clearer error messages for datatype mismatch in HDFStore.append. Raise ValueError when nan_rep too large for pytable column. Add and modify applicable test code. * Fix missed tests and correct mistake in error message. * Remove excess comments. Reverse error type change to avoid api changes. Move nan_rep tests into separate function. 
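As a usage sketch of the accumulations this implements (values are illustrative, not taken from the test suite; ``string[python]`` explicitly selects the Python-backed storage):

    import pandas as pd

    s = pd.Series(["b", pd.NA, "a"], dtype="string[python]")
    s.cumsum()              # running concatenation: ["b", <NA>, "ba"]
    s.cummin()              # running lexicographic minimum: ["b", <NA>, "a"]
    s.cumsum(skipna=False)  # NA propagates from the first missing value on
    # cumprod has no string meaning and raises TypeError.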
(cherry picked from commit 57340ecd08580f26ee4a976c1f68b2f563c41569) * TST(string dtype): Resolve xfails in pytables (#60795) (cherry picked from commit 4511251ccf409f2ba71cab0283bdf751697ee539) * BUG(string dtype): Resolve pytables xfail when reading with condition (#60943) (cherry picked from commit 0ec5f2668e9568d90595180d5ee925305ec7182e) * Backport PR #60940: ENH: Add dtype argument to str.decode * Backport PR #60938: ENH(string dtype): Implement cumsum for Python-backed strings --------- Co-authored-by: Jake Thomas Trevallion <136272202+JakeTT404@users.noreply.github.com> --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/arrays/string_.py | 83 ++++++++++++++++++++++++++ pandas/tests/apply/test_str.py | 15 +---- pandas/tests/extension/test_string.py | 6 +- pandas/tests/series/test_cumulative.py | 11 ++-- 5 files changed, 92 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index c4e01a86ce843..db3dcb50bacd0 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -38,7 +38,7 @@ Other enhancements - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) - The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`) -- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) +- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3efb48c86e92c..c1048e806ff9a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -46,6 +46,7 @@ ) from pandas.core import ( + missing, nanops, ops, ) @@ -865,6 +866,88 @@ def _reduce( return result raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArray: + """ + Return an ExtensionArray performing an accumulation operation. + + The underlying data type might change. + + Parameters + ---------- + name : str + Name of the function, supported values are: + - cummin + - cummax + - cumsum + - cumprod + skipna : bool, default True + If True, skip NA values. + **kwargs + Additional keyword arguments passed to the accumulation function. + Currently, there is no supported kwarg. 
+ + Returns + ------- + array + + Raises + ------ + NotImplementedError : subclass does not define accumulations + """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + # We may need to strip out trailing NA values + tail: np.ndarray | None = None + na_mask: np.ndarray | None = None + ndarray = self._ndarray + np_func = { + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + }[name] + + if self._hasna: + na_mask = cast("npt.NDArray[np.bool_]", isna(ndarray)) + if np.all(na_mask): + return type(self)(ndarray) + if skipna: + if name == "cumsum": + ndarray = np.where(na_mask, "", ndarray) + else: + # We can retain the running min/max by forward/backward filling. + ndarray = ndarray.copy() + missing.pad_or_backfill_inplace( + ndarray, + method="pad", + axis=0, + ) + missing.pad_or_backfill_inplace( + ndarray, + method="backfill", + axis=0, + ) + else: + # When not skipping NA values, the result should be null from + # the first NA value onward. + idx = np.argmax(na_mask) + tail = np.empty(len(ndarray) - idx, dtype="object") + tail[:] = self.dtype.na_value + ndarray = ndarray[:idx] + + # mypy: Cannot call function of unknown type + np_result = np_func(ndarray) # type: ignore[operator] + + if tail is not None: + np_result = np.hstack((np_result, tail)) + elif na_mask is not None: + # Argument 2 to "where" has incompatible type "NAType | float" + np_result = np.where(na_mask, self.dtype.na_value, np_result) # type: ignore[arg-type] + + result = type(self)(np_result) + return result + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: if self.dtype.na_value is np.nan and result is libmissing.NA: # the masked_reductions use pd.NA -> convert to np.nan diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 9c7836a0aa167..17e8322dc40e1 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas.compat import HAS_PYARROW - from pandas.core.dtypes.common import is_number from pandas import ( @@ -168,21 +166,10 @@ def test_agg_cython_table_series(series, func, expected): ), ), ) -def test_agg_cython_table_transform_series(request, series, func, expected): +def test_agg_cython_table_transform_series(series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - if ( - series.dtype == "string" - and func in ("cumsum", np.cumsum, np.nancumsum) - and not HAS_PYARROW - ): - request.applymarker( - pytest.mark.xfail( - raises=NotImplementedError, - reason="TODO(infer_string) cumsum not yet implemented for string", - ) - ) warn = None if isinstance(func, str) else FutureWarning with tm.assert_produces_warning(warn, match="is currently using Series.*"): result = series.agg(func) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 301c7ee851aa0..526cf426781ad 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -200,11 +200,7 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: assert isinstance(ser.dtype, StorageExtensionDtype) - return ser.dtype.storage == "pyarrow" and op_name in [ - "cummin", - "cummax", - "cumsum", - ] + return op_name in ["cummin", "cummax", "cumsum"] def _cast_pointwise_result(self, op_name: str, obj, 
other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 0dc391db2182b..97f5fb4a9f96f 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -193,13 +193,14 @@ def test_cumprod_timedelta(self): ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), ], ) - def test_cum_methods_pyarrow_strings( - self, pyarrow_string_dtype, data, op, skipna, expected_data + def test_cum_methods_ea_strings( + self, string_dtype_no_object, data, op, skipna, expected_data ): - # https://github.com/pandas-dev/pandas/pull/60633 - ser = pd.Series(data, dtype=pyarrow_string_dtype) + # https://github.com/pandas-dev/pandas/pull/60633 - pyarrow + # https://github.com/pandas-dev/pandas/pull/60938 - Python + ser = pd.Series(data, dtype=string_dtype_no_object) method = getattr(ser, op) - expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) + expected = pd.Series(expected_data, dtype=string_dtype_no_object) result = method(skipna=skipna) tm.assert_series_equal(result, expected) From 3ee2ffe9c1b73136a1aea86a89f0ac1ca62924d3 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 24 Feb 2025 15:21:05 -0800 Subject: [PATCH 392/396] Backport PR #61000 on branch 2.3.x (TST: Change sqlite test query string values to single quotes) (#61001) Backport PR #61000: TST: Change sqlite test query string values to single quotes Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/io/test_sql.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 3676721c5e6b7..89adf18545815 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4255,11 +4255,11 @@ def test_xsqlite_execute_fail(sqlite_buildin): cur.execute(create_sql) with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql: - pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)') - pandas_sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)') + pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 1.234)") + pandas_sql.execute("INSERT INTO test VALUES('foo', 'baz', 2.567)") with pytest.raises(sql.DatabaseError, match="Execution failed on sql"): - pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 7)') + pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 7)") def test_xsqlite_execute_closed_connection(): @@ -4277,7 +4277,7 @@ def test_xsqlite_execute_closed_connection(): cur.execute(create_sql) with sql.pandasSQL_builder(conn) as pandas_sql: - pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)') + pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 1.234)") msg = "Cannot operate on a closed database." 
with pytest.raises(sqlite3.ProgrammingError, match=msg): From 9857d31e655e88a73ec864cf2c154c069702aaae Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 3 Mar 2025 18:56:39 -0800 Subject: [PATCH 393/396] Backport PR #61042: CI/TST: Fix xfail in test_columns_dtypes_not_invalid for pyarrow nightly (#61044) * Backport PR #61042: CI/TST: Fix xfail in test_columns_dtypes_not_invalid for pyarrow nightly * remove xfail --- pandas/compat/pyarrow.py | 2 ++ pandas/tests/io/test_parquet.py | 37 +++++++++++++++++++-------------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 81a2d0dc80a10..d78827042e95c 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -19,6 +19,7 @@ pa_version_under17p0 = _palv < Version("17.0.0") pa_version_under18p0 = _palv < Version("18.0.0") pa_version_under19p0 = _palv < Version("19.0.0") + pa_version_under20p0 = _palv < Version("20.0.0") HAS_PYARROW = True except ImportError: pa_version_under10p1 = True @@ -32,4 +33,5 @@ pa_version_under17p0 = True pa_version_under18p0 = True pa_version_under19p0 = True + pa_version_under20p0 = True HAS_PYARROW = False diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index f66ee7dc4367e..45aed8df6d416 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -20,6 +20,7 @@ pa_version_under13p0, pa_version_under15p0, pa_version_under19p0, + pa_version_under20p0, ) import pandas as pd @@ -1103,24 +1104,28 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) - def test_columns_dtypes_not_invalid(self, pa): + @pytest.mark.parametrize( + "columns", + [ + [0, 1], + pytest.param( + [b"foo", b"bar"], + marks=pytest.mark.xfail( + pa_version_under20p0, + raises=NotImplementedError, + reason="https://github.com/apache/arrow/pull/44171", + ), + ), + [ + datetime.datetime(2011, 1, 1, 0, 0), + datetime.datetime(2011, 1, 1, 1, 1), + ], + ], + ) + def test_columns_dtypes_not_invalid(self, pa, columns): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) - # numeric - df.columns = [0, 1] - check_round_trip(df, pa) - - # bytes - df.columns = [b"foo", b"bar"] - with pytest.raises(NotImplementedError, match="|S3"): - # Bytes fails on read_parquet - check_round_trip(df, pa) - - # python object - df.columns = [ - datetime.datetime(2011, 1, 1, 0, 0), - datetime.datetime(2011, 1, 1, 1, 1), - ] + df.columns = columns check_round_trip(df, pa) def test_empty_columns(self, pa): From 47bc953e41ebf316a37606e6d2f0e78d40d1dc1d Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 12 Mar 2025 09:51:22 -0700 Subject: [PATCH 394/396] Backport PR #61085 on branch 2.3.x (DEPS: Update NumpyExtensionArray repr for NEP51) (#61107) Backport PR #61085: DEPS: Update NumpyExtensionArray repr for NEP51 Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/core/arrays/numpy_.py | 12 +++++++++ pandas/tests/arrays/numpy_/test_numpy.py | 31 ++++++++++++++++++++++-- 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index db3dcb50bacd0..fc60789801ce7 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -37,6 +37,7 @@ Other enhancements updated to raise FutureWarning with NumPy >= 
2 (:issue:`60340`) - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) +- Improved ``repr`` of :class:`.NumpyExtensionArray` to account for NEP51 (:issue:`61085`) - The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`) - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 07fa6254d87f3..e0031d3db6ca7 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -2,6 +2,7 @@ from typing import ( TYPE_CHECKING, + Any, Literal, ) @@ -29,6 +30,8 @@ from pandas.core.strings.object_array import ObjectStringArrayMixin if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( AxisInt, Dtype, @@ -560,3 +563,12 @@ def _wrap_ndarray_result(self, result: np.ndarray): return TimedeltaArray._simple_new(result, dtype=result.dtype) return type(self)(result) + + def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: + # NEP 51: https://github.com/numpy/numpy/pull/22449 + if self.dtype.kind in "SU": + return "'{}'".format + elif self.dtype == "object": + return repr + else: + return str diff --git a/pandas/tests/arrays/numpy_/test_numpy.py b/pandas/tests/arrays/numpy_/test_numpy.py index 5112ce262f771..f21fb4ccfba07 100644 --- a/pandas/tests/arrays/numpy_/test_numpy.py +++ b/pandas/tests/arrays/numpy_/test_numpy.py @@ -21,7 +21,7 @@ np.array([True, False], dtype=bool), np.array([0, 1], dtype="datetime64[ns]"), np.array([0, 1], dtype="timedelta64[ns]"), - ] + ], ) def any_numpy_array(request): """ @@ -29,7 +29,7 @@ def any_numpy_array(request): This excludes string and bytes. 
""" - return request.param + return request.param.copy() # ---------------------------------------------------------------------------- @@ -322,3 +322,30 @@ def test_factorize_unsigned(): tm.assert_numpy_array_equal(res_codes, exp_codes) tm.assert_extension_array_equal(res_unique, NumpyExtensionArray(exp_unique)) + + +# ---------------------------------------------------------------------------- +# Output formatting + + +def test_array_repr(any_numpy_array): + # GH#61085 + nparray = any_numpy_array + arr = NumpyExtensionArray(nparray) + if nparray.dtype == "object": + values = "['a', 'b']" + elif nparray.dtype == "float64": + values = "[0.0, 1.0]" + elif str(nparray.dtype).startswith("int"): + values = "[0, 1]" + elif nparray.dtype == "complex128": + values = "[0j, (1+2j)]" + elif nparray.dtype == "bool": + values = "[True, False]" + elif nparray.dtype == "datetime64[ns]": + values = "[1970-01-01T00:00:00.000000000, 1970-01-01T00:00:00.000000001]" + elif nparray.dtype == "timedelta64[ns]": + values = "[0 nanoseconds, 1 nanoseconds]" + expected = f"\n{values}\nLength: 2, dtype: {nparray.dtype}" + result = repr(arr) + assert result == expected, f"{result} vs {expected}" From 00d8d2af6522be68b853a18ca5cde030940c1593 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 13 Mar 2025 08:45:05 -0700 Subject: [PATCH 395/396] Backport PR #61098: CI/TST: Address TestArrowArray::test_reduce_series_numeric supporting skew (#61102) * Backport PR #61098: CI/TST: Address TestArrowArray::test_reduce_series_numeric supporting skew * remove skew from check * Remove kurt too * Add skew to bool skip list * Add boolean * bool fails regardless of skipna --- pandas/compat/__init__.py | 2 ++ pandas/tests/extension/test_arrow.py | 35 +++++++++++++++++++++------- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 9b6b1ab3b8909..ff99d6b759d66 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -35,6 +35,7 @@ pa_version_under17p0, pa_version_under18p0, pa_version_under19p0, + pa_version_under20p0, ) if TYPE_CHECKING: @@ -195,6 +196,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under17p0", "pa_version_under18p0", "pa_version_under19p0", + "pa_version_under20p0", "HAS_PYARROW", "IS64", "ISMUSL", diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index a63cde8022e24..17fe36c4b4469 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -40,6 +40,7 @@ pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, + pa_version_under20p0, ) from pandas.core.dtypes.dtypes import ( @@ -448,6 +449,9 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques self.check_accumulate(ser, op_name, skipna) def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + if op_name == "kurt" or (pa_version_under20p0 and op_name == "skew"): + return False + dtype = ser.dtype # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has # no attribute "pyarrow_dtype" @@ -464,7 +468,7 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: pass else: return False - elif pa.types.is_binary(pa_dtype) and op_name == "sum": + elif pa.types.is_binary(pa_dtype) and op_name in ["sum", "skew"]: return False elif ( pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) @@ -525,18 +529,31 @@ def test_reduce_series_numeric(self, data, 
all_numeric_reductions, skipna, reque f"pyarrow={pa.__version__} for {pa_dtype}" ), ) - if all_numeric_reductions in {"skew", "kurt"} and ( - dtype._is_numeric or dtype.kind == "b" - ): - request.applymarker(xfail_mark) - - elif pa.types.is_boolean(pa_dtype) and all_numeric_reductions in { + if pa.types.is_boolean(pa_dtype) and all_numeric_reductions in { "sem", "std", "var", "median", }: request.applymarker(xfail_mark) + elif ( + not pa_version_under20p0 + and all_numeric_reductions == "skew" + and ( + pa.types.is_boolean(pa_dtype) + or ( + skipna + and ( + pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype) + ) + ) + ) + ): + request.applymarker( + pytest.mark.xfail( + reason="https://github.com/apache/arrow/issues/45733", + ) + ) super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) @pytest.mark.parametrize("skipna", [True, False]) @@ -563,7 +580,7 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): if op_name in ["max", "min"]: cmp_dtype = arr.dtype elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": - if op_name not in ["median", "var", "std"]: + if op_name not in ["median", "var", "std", "skew"]: cmp_dtype = arr.dtype else: cmp_dtype = "float64[pyarrow]" @@ -582,7 +599,7 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_frame(self, data, all_numeric_reductions, skipna, request): op_name = all_numeric_reductions - if op_name == "skew": + if op_name == "skew" and pa_version_under20p0: if data.dtype._is_numeric: mark = pytest.mark.xfail(reason="skew not implemented") request.applymarker(mark) From 9bb152d48a9f6e27f36703d267e757de6e94b196 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 9 Apr 2025 12:48:24 -0700 Subject: [PATCH 396/396] CI: Pin Cython to a specific commit Window PY3.13t builds (#61261) --- scripts/cibw_before_build.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh index 679b91e3280ec..04333f446a7ff 100644 --- a/scripts/cibw_before_build.sh +++ b/scripts/cibw_before_build.sh @@ -7,6 +7,8 @@ done FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" if [[ $FREE_THREADED_BUILD == "True" ]]; then python -m pip install -U pip - python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython - python -m pip install ninja meson-python versioneer[toml] + # python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython + # TODO: Remove below and uncomment above once https://github.com/cython/cython/pull/6717 no longer breaks tests + python -m pip install git+https://github.com/cython/cython.git@3276b588720a053c78488e5de788605950f4b136 + python -m pip install ninja meson-python versioneer[toml] numpy fi