Skip to content

ENH: Implement categorical accessor for arrow arrays #56670

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 12 commits into from
10 changes: 10 additions & 0 deletions pandas/_testing/asserters.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
)
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
CategoricalDtypeType,
DatetimeTZDtype,
ExtensionDtype,
NumpyEADtype,
Expand Down Expand Up @@ -931,6 +932,15 @@ def assert_series_equal(
pass
else:
assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}")
if (
isinstance(left.dtype, pd.ArrowDtype)
and issubclass(left.dtype.type, CategoricalDtypeType)
and isinstance(right.dtype, pd.ArrowDtype)
and issubclass(right.dtype.type, CategoricalDtypeType)
):
assert_extension_array_equal(
left.cat.categories, right.cat.categories, obj="Categories"
)
if check_exact or (
(is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype))
or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype))
Expand Down
164 changes: 163 additions & 1 deletion pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
is_scalar,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.inference import is_dict_like
from pandas.core.dtypes.missing import isna

from pandas.core import (
Expand Down Expand Up @@ -2117,7 +2118,7 @@ def _replace_with_mask(
cls,
values: pa.Array | pa.ChunkedArray,
mask: npt.NDArray[np.bool_] | bool,
replacements: ArrayLike | Scalar,
replacements: ArrayLike | Scalar | None,
) -> pa.Array | pa.ChunkedArray:
"""
Replace items selected with a mask.
Expand Down Expand Up @@ -2897,6 +2898,167 @@ def _dt_tz_convert(self, tz) -> Self:
result = self._pa_array.cast(pa.timestamp(current_unit, tz))
return type(self)(result)

@functools.cached_property
def _cat_identical_categories(self) -> bool:
    """
    Whether every chunk of the backing array carries the same dictionary.

    Returns
    -------
    bool
        True when the dictionary of each chunk equals the dictionary of
        the first chunk (same length, same values, same order). An array
        without any chunks is treated as trivially identical.
    """
    chunks = self._pa_array.chunks
    if not chunks:
        return True
    base_cats = chunks[0].dictionary
    # ``Array.equals`` compares whole arrays and simply returns False when
    # the lengths differ; an element-wise ``pc.equal`` raises on
    # dictionaries of unequal length instead of reporting "not identical".
    return all(chunk.dictionary.equals(base_cats) for chunk in chunks[1:])

@property
def _cat_categories(self):
    """
    Distinct category values gathered from the dictionaries of all chunks.

    The per-chunk dictionaries are concatenated and de-duplicated with
    ``pc.unique``; the result is wrapped in a new instance of this class.
    """
    dictionaries = []
    for chunk in self._pa_array.chunks:
        dictionaries.append(chunk.dictionary)
    combined = pa.concat_arrays(dictionaries)
    return type(self)(pc.unique(combined))

def _cat_cats_with_identity_check(self):
    """
    Return the categories, reusing the first chunk's dictionary if possible.

    When all chunks share one dictionary, that dictionary is wrapped and
    returned directly; otherwise this falls back to ``_cat_categories``.
    """
    if not self._cat_identical_categories:
        return self._cat_categories
    first_chunk = self._pa_array.chunks[0]
    return type(self)(first_chunk.dictionary)

@property
def _cat_ordered(self):
    """Return the ``ordered`` flag of the backing array's type."""
    dictionary_type = self._pa_array.type
    return dictionary_type.ordered

def _cat_rename_categories(self, new_categories):
    """
    Replace the category values while keeping every element's code.

    Parameters
    ----------
    new_categories : dict-like, callable or array-like
        A dict-like maps old category values to new ones (values without
        an entry are kept as they are). A callable is applied to each
        current category. Anything else is used as the new dictionary
        as-is.

    Returns
    -------
    Same type as ``self``, rebuilt from the existing indices and the new
    dictionary, with the ``ordered`` flag of the original type preserved.
    """
    if is_dict_like(new_categories):
        # Bind the mapping to its own name so the closure does not depend
        # on ``new_categories`` being rebound only after ``map`` finishes.
        lookup = new_categories

        def mapper(item):
            return lookup.get(item, item)

        new_categories = self._cat_cats_with_identity_check().map(mapper)
    elif callable(new_categories):
        new_categories = self._cat_cats_with_identity_check().map(new_categories)

    if not self._cat_identical_categories:
        # Chunks with differing dictionaries are merged into a single
        # chunk so that one set of indices is paired with the new values.
        pa_array = pa.chunked_array([self._pa_array.combine_chunks()])
    else:
        pa_array = self._pa_array

    # ``from_arrays`` defaults to ``ordered=False``; pass the current flag
    # explicitly so that renaming does not silently drop the ordering.
    ordered = self._pa_array.type.ordered
    arrs = [
        pa.DictionaryArray.from_arrays(
            arr.indices, new_categories, ordered=ordered
        )
        for arr in pa_array.chunks
    ]
    return type(self)(pa.chunked_array(arrs))

def _cat_reorder_categories(self, new_categories, ordered=None) -> Self:
    """
    Reorder the categories as given by ``new_categories``.

    Parameters
    ----------
    new_categories : array-like
        The existing categories in their new order.
    ordered : bool, optional
        Forwarded unchanged to ``_cat_set_categories``.

    Returns
    -------
    Self
        The result of ``_cat_set_categories(new_categories, ordered=ordered)``.

    Raises
    ------
    ValueError
        If ``new_categories`` differs in length from the current
        dictionary, or if some current category is missing from it.
    """
    if not self._cat_identical_categories:
        pa_array = pa.chunked_array([self._pa_array.combine_chunks()])
    else:
        pa_array = self._pa_array

    old_categories = pa_array.chunks[0].dictionary
    same_length = len(old_categories) == len(new_categories)
    # ``pc.all`` returns a pyarrow scalar; unwrap it with ``as_py`` so the
    # boolean test looks at the value rather than at the scalar object.
    # ``min_count=0`` makes the reduction over an empty dictionary True
    # instead of null.
    all_present = same_length and pc.all(
        pc.is_in(old_categories, pa.array(new_categories)), min_count=0
    ).as_py()
    if not all_present:
        raise ValueError(
            "items in new_categories are not the same as in old categories"
        )
    return self._cat_set_categories(new_categories, ordered=ordered)

def _cat_add_categories(self, new_categories):
    """
    Append ``new_categories`` to the end of the dictionary.

    Parameters
    ----------
    new_categories : array-like
        Values to append; they are boxed to the value type of the first
        chunk's dictionary.

    Returns
    -------
    Same type as ``self``. When all chunks share one dictionary, every
    chunk receives the extended dictionary; otherwise only the last chunk
    is rebuilt, on top of its own dictionary. The ``ordered`` flag of the
    original type is preserved.
    """
    # ``chunks`` is used as a local working list of the chunk arrays.
    chunks = self._pa_array.chunks
    # ``from_arrays`` defaults to ``ordered=False``. Passing the current
    # flag keeps the ordering and keeps rebuilt chunks type-compatible
    # with the chunks that are left untouched.
    ordered = self._pa_array.type.ordered
    new_cats = self._box_pa_array(new_categories, pa_type=chunks[0].dictionary.type)
    cats = pa.chunked_array([chunks[-1].dictionary, new_cats]).combine_chunks()
    if not self._cat_identical_categories:
        chunks[-1] = pa.DictionaryArray.from_arrays(
            chunks[-1].indices, cats, ordered=ordered
        )
    else:
        chunks = [
            pa.DictionaryArray.from_arrays(chunk.indices, cats, ordered=ordered)
            for chunk in chunks
        ]
    return type(self)(pa.chunked_array(chunks))

def _cat_remove_categories(self, removals):
    """
    Replace every element whose value is in ``removals`` with ``None``.

    Each chunk is processed separately through ``_replace_with_mask``,
    using a ``pc.is_in`` membership mask against ``removals``.

    NOTE(review): nothing here visibly prunes the removed values from the
    chunk dictionaries themselves - confirm whether that is intended.
    """
    removal_values = pa.array(removals)
    masked_chunks = []
    for chunk in self._pa_array.chunks:
        to_null = pc.is_in(chunk, removal_values)
        masked_chunks.append(self._replace_with_mask(chunk, to_null, None))
    return type(self)(pa.chunked_array(masked_chunks))

def _cat_remove_unused_categories(self):
    """
    Drop dictionary entries that no element refers to.

    All chunks are combined first, so the result always has exactly one
    chunk. The surviving categories keep their relative order, null
    elements stay null, and the ``ordered`` flag of the original type is
    preserved.
    """
    arr = self._pa_array.combine_chunks()

    # Turn null indices into the -1 sentinel before handing the codes to
    # numpy. ``pa.Array.to_numpy`` has no ``dtype`` argument, so the
    # conversion to a platform integer is done via ``astype`` afterwards.
    codes = (
        pc.fill_null(pc.cast(arr.indices, pa.int64()), -1)
        .to_numpy(zero_copy_only=False)
        .astype(np.intp, copy=False)
    )

    used, inverse = np.unique(codes, return_inverse=True)

    if used.size != 0 and used[0] == -1:  # na sentinel
        # Drop the sentinel from the used codes; shifting the inverse by
        # one maps null positions to -1 and realigns the remaining codes.
        used, inverse = used[1:], inverse - 1

    new_categories = arr.dictionary.take(pa.array(used))
    # Mask the -1 positions so they become nulls rather than invalid
    # (negative) dictionary indices.
    new_indices = pa.array(inverse, mask=inverse == -1)
    return type(self)(
        pa.chunked_array(
            [
                pa.DictionaryArray.from_arrays(
                    new_indices,
                    new_categories,
                    ordered=self._pa_array.type.ordered,
                )
            ]
        )
    )

def _cat_set_categories(self, new_categories, ordered=None, rename=False):
    """
    Set the categories to ``new_categories``.

    Parameters
    ----------
    new_categories : array-like
        The categories of the result.
    ordered : bool, optional
        Ordered flag of the result; defaults to the current flag.
    rename : bool, default False
        If True, existing codes are kept and ``new_categories`` is used as
        a positional rename of the old categories: extra new categories
        are simply unused, and with fewer new categories the elements
        whose code no longer has a category become null. If False, codes
        are recomputed so that each element keeps its value; values
        absent from ``new_categories`` become null.

    Returns
    -------
    Same type as ``self``.
    """
    if ordered is None:
        ordered = self._cat_ordered

    if not self._cat_identical_categories:
        # Merge chunks with differing dictionaries into a single chunk.
        pa_array = pa.chunked_array([self._pa_array.combine_chunks()])
    else:
        pa_array = self._pa_array

    cats = pa.array(new_categories)
    if rename:
        if len(new_categories) >= len(self._cat_cats_with_identity_check()):
            # Every existing code is still valid: only swap the dictionary.
            chunks = [
                pa.DictionaryArray.from_arrays(c.indices, cats, ordered=ordered)
                for c in pa_array.chunks
            ]
        else:
            # Fewer categories than before: codes that point past the end
            # of the new dictionary are nulled out, and the new names are
            # applied to the codes that remain.
            n_keep = len(new_categories)
            chunks = []
            for c in pa_array.chunks:
                indices = c.indices
                in_range = pc.less(indices, n_keep)
                # A null ``in_range`` (index already null) yields null too.
                kept = pc.if_else(
                    in_range, indices, pa.scalar(None, type=indices.type)
                )
                chunks.append(
                    pa.DictionaryArray.from_arrays(kept, cats, ordered=ordered)
                )
        return type(self)(pa.chunked_array(chunks))

    from pandas.core.arrays.categorical import recode_for_categories
    from pandas.core.indexes.base import Index

    arrs = []
    for chunk in pa_array.chunks:
        codes = recode_for_categories(
            chunk.indices, chunk.dictionary, Index(new_categories)
        )
        # Positions recoded to -1 are masked so they become nulls.
        arrs.append(
            pa.DictionaryArray.from_arrays(
                codes, new_categories, mask=np.equal(codes, -1), ordered=ordered
            )
        )
    return type(self)(pa.chunked_array(arrs))

def _cat_set_ordered(self, ordered):
    """
    Rebuild every chunk with the given ``ordered`` flag.

    Indices and dictionaries of the chunks are reused unchanged.
    """
    rebuilt = []
    for chunk in self._pa_array.chunks:
        rebuilt.append(
            pa.DictionaryArray.from_arrays(
                chunk.indices, chunk.dictionary, ordered=ordered
            )
        )
    return type(self)(pa.chunked_array(rebuilt))

def _cat_as_unordered(self):
    """Return the result of ``_cat_set_ordered`` with the flag set to False."""
    return self._cat_set_ordered(ordered=False)

def _cat_as_ordered(self):
    """Return the result of ``_cat_set_ordered`` with the flag set to True."""
    return self._cat_set_ordered(ordered=True)


def transpose_homogeneous_pyarrow(
arrays: Sequence[ArrowExtensionArray],
Expand Down
Loading