pandas-dev · phofl · Dec 28, 2023 · Dec 28, 2023 · Dec 28, 2023 · Jan 6, 2024
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
@@ -25,6 +25,7 @@
 )
 from pandas.core.dtypes.dtypes import (
     CategoricalDtype,
+    CategoricalDtypeType,
     DatetimeTZDtype,
     ExtensionDtype,
     NumpyEADtype,
@@ -931,6 +932,15 @@ def assert_series_equal(
             pass
         else:
             assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}")
+        if (
+            isinstance(left.dtype, pd.ArrowDtype)
+            and issubclass(left.dtype.type, CategoricalDtypeType)
+            and isinstance(right.dtype, pd.ArrowDtype)
+            and issubclass(right.dtype.type, CategoricalDtypeType)
+        ):
+            assert_extension_array_equal(
+                left.cat.categories, right.cat.categories, obj="Categories"
+            )
     if check_exact or (
         (is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype))
         or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype))

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -46,6 +46,7 @@
     is_scalar,
 )
 from pandas.core.dtypes.dtypes import DatetimeTZDtype
+from pandas.core.dtypes.inference import is_dict_like
 from pandas.core.dtypes.missing import isna
 
 from pandas.core import (
@@ -2117,7 +2118,7 @@ def _replace_with_mask(
         cls,
         values: pa.Array | pa.ChunkedArray,
         mask: npt.NDArray[np.bool_] | bool,
-        replacements: ArrayLike | Scalar,
+        replacements: ArrayLike | Scalar | None,
     ) -> pa.Array | pa.ChunkedArray:
         """
         Replace items selected with a mask.
@@ -2897,6 +2898,167 @@ def _dt_tz_convert(self, tz) -> Self:
         result = self._pa_array.cast(pa.timestamp(current_unit, tz))
         return type(self)(result)
 
+    @functools.cached_property
+    def _cat_identical_categories(self) -> bool:
+        """
+        Whether all chunks of the underlying array share one dictionary.
+
+        Returns
+        -------
+        bool
+            True if there are fewer than two chunks, or if the dictionary of
+            every later chunk equals the dictionary of the first chunk.
+        """
+        chunks = self._pa_array.chunks
+        if len(chunks) < 2:
+            # Nothing to compare against; also avoids indexing an empty list.
+            return True
+        base_cats = chunks[0].dictionary
+        # Array.equals copes with dictionaries of different lengths, whereas the
+        # element-wise pc.equal kernel requires equal-length inputs and yields a
+        # null (falsy) aggregate for empty dictionaries.
+        return all(chunk.dictionary.equals(base_cats) for chunk in chunks[1:])
+
+    @property
+    def _cat_categories(self):
+        """
+        Unique categories gathered from the dictionaries of all chunks.
+
+        The dictionaries of every chunk are concatenated and de-duplicated with
+        ``pc.unique``; the result is wrapped in the same array type as ``self``.
+        """
+        dictionaries = [chunk.dictionary for chunk in self._pa_array.chunks]
+        combined = pa.concat_arrays(dictionaries)
+        return type(self)(pc.unique(combined))
+
+    def _cat_cats_with_identity_check(self):
+        """
+        Return the categories, reusing the first chunk's dictionary if possible.
+
+        When ``_cat_identical_categories`` is true the dictionary of the first
+        chunk is wrapped directly; otherwise ``_cat_categories`` is returned.
+        """
+        if not self._cat_identical_categories:
+            return self._cat_categories
+        shared_dictionary = self._pa_array.chunks[0].dictionary
+        return type(self)(shared_dictionary)
+
+    @property
+    def _cat_ordered(self):
+        """Return the ``ordered`` attribute of the underlying pyarrow type."""
+        dictionary_type = self._pa_array.type
+        return dictionary_type.ordered
+
+    def _cat_rename_categories(self, new_categories):
+        """
+        Replace the category labels while keeping the existing indices.
+
+        Parameters
+        ----------
+        new_categories : list-like, dict-like or callable
+            * dict-like: each current category is looked up with ``.get``;
+              categories without an entry are kept as they are.
+            * callable: applied to every current category via ``map``.
+            * anything else: passed as the new dictionary unchanged.
+
+        Returns
+        -------
+        Same type as ``self``
+            Array whose chunks pair the old indices with the new dictionary.
+        """
+        if is_dict_like(new_categories):
+            mapping = new_categories
+
+            def mapper(item):
+                return mapping.get(item, item)
+
+            new_categories = self._cat_cats_with_identity_check().map(mapper)
+        elif callable(new_categories):
+            mapper = new_categories
+            new_categories = self._cat_cats_with_identity_check().map(mapper)
+
+        if not self._cat_identical_categories:
+            # Collapse into a single chunk so one set of indices is in play.
+            pa_array = pa.chunked_array([self._pa_array.combine_chunks()])
+        else:
+            pa_array = self._pa_array
+
+        # from_arrays defaults to ordered=False; forward the current flag so that
+        # renaming does not silently turn an ordered array into an unordered one.
+        ordered = self._pa_array.type.ordered
+        arrs = [
+            pa.DictionaryArray.from_arrays(
+                arr.indices, new_categories, ordered=ordered
+            )
+            for arr in pa_array.chunks
+        ]
+        return type(self)(pa.chunked_array(arrs))
+
+    def _cat_reorder_categories(self, new_categories, ordered=None) -> Self:
+        """
+        Reorder the categories as specified in ``new_categories``.
+
+        Parameters
+        ----------
+        new_categories : list-like
+            The current categories in their new order.
+        ordered : bool, optional
+            Forwarded to ``_cat_set_categories``.
+
+        Returns
+        -------
+        Self
+
+        Raises
+        ------
+        ValueError
+            If ``new_categories`` differs in length from the current categories
+            or does not contain every current category.
+        """
+        if not self._cat_identical_categories:
+            pa_array = pa.chunked_array([self._pa_array.combine_chunks()])
+        else:
+            pa_array = self._pa_array
+
+        old_categories = pa_array.chunks[0].dictionary
+        # pc.all returns a pyarrow scalar, which must be unwrapped with as_py()
+        # before negating it. min_count=0 makes an empty dictionary evaluate to
+        # True instead of null.
+        if len(old_categories) != len(new_categories) or not pc.all(
+            pc.is_in(old_categories, pa.array(new_categories)), min_count=0
+        ).as_py():
+            raise ValueError(
+                "items in new_categories are not the same as in old categories"
+            )
+        return self._cat_set_categories(new_categories, ordered=ordered)
+
+    def _cat_add_categories(self, new_categories):
+        """
+        Append ``new_categories`` to the dictionary.
+
+        The new categories are boxed to the dictionary's value type and appended
+        to the dictionary of the last chunk. If all chunks share one dictionary,
+        every chunk receives the extended dictionary; otherwise only the last
+        chunk is rebuilt.
+
+        Parameters
+        ----------
+        new_categories : scalar or list-like
+            Categories to append.
+
+        Returns
+        -------
+        Same type as ``self``
+        """
+        chunks = self._pa_array.chunks
+        # from_arrays defaults to ordered=False; keep the current flag so the
+        # rebuilt chunks retain the same dictionary type as the untouched ones.
+        ordered = self._pa_array.type.ordered
+        new_cats = self._box_pa_array(new_categories, pa_type=chunks[0].dictionary.type)
+        cats = pa.chunked_array([chunks[-1].dictionary, new_cats]).combine_chunks()
+        if not self._cat_identical_categories:
+            chunks[-1] = pa.DictionaryArray.from_arrays(
+                chunks[-1].indices, cats, ordered=ordered
+            )
+        else:
+            chunks = [
+                pa.DictionaryArray.from_arrays(chunk.indices, cats, ordered=ordered)
+                for chunk in chunks
+            ]
+        return type(self)(pa.chunked_array(chunks))
+
+    def _cat_remove_categories(self, removals):
+        """
+        Replace every element whose value is in ``removals`` with a missing value.
+
+        Parameters
+        ----------
+        removals : list-like
+            Values to look up in each chunk with ``pc.is_in``.
+
+        Returns
+        -------
+        Same type as ``self``
+            Built from the chunks returned by ``_replace_with_mask``.
+        """
+        removal_values = pa.array(removals)
+        masked_chunks = []
+        for chunk in self._pa_array.chunks:
+            to_null = pc.is_in(chunk, removal_values)
+            masked_chunks.append(self._replace_with_mask(chunk, to_null, None))
+        return type(self)(pa.chunked_array(masked_chunks))
+
+    def _cat_remove_unused_categories(self):
+        """
+        Drop categories that no element refers to.
+
+        All chunks are combined into one array, the set of indices actually in
+        use is determined, and a new dictionary containing only those entries is
+        built. Missing values stay missing.
+
+        Returns
+        -------
+        Same type as ``self``
+            Single-chunk array with the reduced dictionary.
+        """
+        arr = self._pa_array.combine_chunks()
+        indices = arr.indices
+
+        # pyarrow's Array.to_numpy takes no ``dtype`` argument and nulls are
+        # tracked by a validity bitmap rather than a -1 sentinel, so carry the
+        # missing positions in an explicit boolean mask.
+        valid = indices.is_valid().to_numpy(zero_copy_only=False)
+        codes = indices.fill_null(0).to_numpy(zero_copy_only=False)
+
+        # np.unique returns the used codes sorted, which keeps the surviving
+        # categories in their original relative order.
+        used, inverse = np.unique(codes[valid], return_inverse=True)
+        new_codes = np.zeros(len(codes), dtype=np.intp)
+        new_codes[valid] = inverse
+
+        new_categories = pc.array_take(arr.dictionary, pa.array(used))
+        new_indices = pa.array(new_codes, mask=~valid)
+        return type(self)(
+            pa.chunked_array(
+                [
+                    pa.DictionaryArray.from_arrays(
+                        new_indices,
+                        new_categories,
+                        ordered=self._pa_array.type.ordered,
+                    )
+                ]
+            )
+        )
+
+    def _cat_set_categories(self, new_categories, ordered=None, rename=False):
+        """
+        Set the categories to ``new_categories``.
+
+        Parameters
+        ----------
+        new_categories : list-like
+            The requested categories.
+        ordered : bool, optional
+            Ordered flag of the result; ``None`` keeps ``self._cat_ordered``.
+        rename : bool, default False
+            If True and there are at least as many new categories as current
+            ones, the existing indices are paired with ``new_categories``. With
+            fewer new categories, the current categories past
+            ``len(new_categories)`` are handed to ``_cat_remove_categories`` and
+            the resulting chunks are rebuilt with their own dictionaries and the
+            requested ordered flag. If False, every chunk is recoded with
+            ``recode_for_categories``.
+
+        Returns
+        -------
+        Same type as ``self``
+        """
+        if ordered is None:
+            ordered = self._cat_ordered
+
+        if self._cat_identical_categories:
+            pa_array = self._pa_array
+        else:
+            # Collapse into a single chunk before touching the dictionary.
+            pa_array = pa.chunked_array([self._pa_array.combine_chunks()])
+
+        cats = pa.array(new_categories)
+
+        def rebuild(chunk, dictionary):
+            return pa.DictionaryArray.from_arrays(
+                chunk.indices, dictionary, ordered=ordered
+            )
+
+        if rename:
+            n_new = len(new_categories)
+            if n_new < len(self._cat_cats_with_identity_check()):
+                surplus = pa_array.chunks[0].dictionary[n_new:]
+                trimmed = self._cat_remove_categories(surplus)
+                rebuilt = [rebuild(c, c.dictionary) for c in trimmed._pa_array.chunks]
+            else:
+                rebuilt = [rebuild(c, cats) for c in pa_array.chunks]
+            return type(self)(pa.chunked_array(rebuilt))
+
+        from pandas.core.arrays.categorical import recode_for_categories
+        from pandas.core.indexes.base import Index
+
+        recoded = []
+        for chunk in pa_array.chunks:
+            new_codes = recode_for_categories(
+                chunk.indices, chunk.dictionary, Index(new_categories)
+            )
+            # A code of -1 marks a value without a counterpart in new_categories.
+            missing = np.equal(new_codes, -1)
+            recoded.append(
+                pa.DictionaryArray.from_arrays(
+                    new_codes, new_categories, mask=missing, ordered=ordered
+                )
+            )
+        return type(self)(pa.chunked_array(recoded))
+
+    def _cat_set_ordered(self, ordered):
+        """
+        Rebuild every chunk with the given ``ordered`` flag.
+
+        Parameters
+        ----------
+        ordered : bool
+            Value passed as ``ordered`` to ``pa.DictionaryArray.from_arrays``.
+
+        Returns
+        -------
+        Same type as ``self``
+            Chunks keep their indices and dictionaries.
+        """
+        reflagged = []
+        for chunk in self._pa_array.chunks:
+            reflagged.append(
+                pa.DictionaryArray.from_arrays(
+                    chunk.indices, chunk.dictionary, ordered=ordered
+                )
+            )
+        return type(self)(pa.chunked_array(reflagged))
+
+    def _cat_as_unordered(self):
+        """Return the result of ``_cat_set_ordered`` with ``ordered=False``."""
+        return self._cat_set_ordered(ordered=False)
+
+    def _cat_as_ordered(self):
+        """Return the result of ``_cat_set_ordered`` with ``ordered=True``."""
+        return self._cat_set_ordered(ordered=True)
+
 
 def transpose_homogeneous_pyarrow(
     arrays: Sequence[ArrowExtensionArray],