Skip to content

API / CoW: ensure every new Series/DataFrame also has a new (shallow-copy) index #53699

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pandas/_libs/internals.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ class BlockManager:
def __init__(
self, blocks: tuple[B, ...], axes: list[Index], verify_integrity=...
) -> None: ...
def get_slice(self, slobj: slice, axis: int = ...) -> Self: ...
def get_slice(
self, slobj: slice, axis: int = ..., using_cow: bool = False
) -> Self: ...
def _rebuild_blknos_and_blklocs(self) -> None: ...

class BlockValuesRefs:
Expand Down
15 changes: 11 additions & 4 deletions pandas/_libs/internals.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -845,7 +845,7 @@ cdef class BlockManager:
# -------------------------------------------------------------------
# Indexing

cdef BlockManager _slice_mgr_rows(self, slice slobj):
cdef BlockManager _slice_mgr_rows(self, slice slobj, bint using_cow):
cdef:
Block blk, nb
BlockManager mgr
Expand All @@ -856,7 +856,10 @@ cdef class BlockManager:
nb = blk.slice_block_rows(slobj)
nbs.append(nb)

new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)]
if using_cow:
new_axes = [self.axes[0]._view(), self.axes[1]._getitem_slice(slobj)]
else:
new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)]
mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False)

# We can avoid having to rebuild blklocs/blknos
Expand All @@ -867,17 +870,21 @@ cdef class BlockManager:
mgr._blklocs = blklocs.copy()
return mgr

def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager:
def get_slice(
self, slobj: slice, axis: int = 0, using_cow: bool = False
) -> BlockManager:

if axis == 0:
new_blocks = self._slice_take_blocks_ax0(slobj)
elif axis == 1:
return self._slice_mgr_rows(slobj)
return self._slice_mgr_rows(slobj, using_cow)
else:
raise IndexError("Requested axis not found in manager")

new_axes = list(self.axes)
new_axes[axis] = new_axes[axis]._getitem_slice(slobj)
if using_cow:
new_axes[1 - axis] = self.axes[1 - axis]._view()

return type(self)(tuple(new_blocks), new_axes, verify_integrity=False)

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4339,7 +4339,7 @@ def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self:
"""
assert isinstance(slobj, slice), type(slobj)
axis = self._get_block_manager_axis(axis)
new_mgr = self._mgr.get_slice(slobj, axis=axis)
new_mgr = self._mgr.get_slice(slobj, axis=axis, using_cow=using_copy_on_write())
result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
result = result.__finalize__(self)

Expand Down
4 changes: 3 additions & 1 deletion pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,7 +671,9 @@ def fast_xs(self, loc: int) -> SingleArrayManager:
result = np.array(values, dtype=dtype)
return SingleArrayManager([result], [self._axes[1]])

def get_slice(self, slobj: slice, axis: AxisInt = 0) -> ArrayManager:
def get_slice(
self, slobj: slice, axis: AxisInt = 0, using_cow: bool = False
) -> ArrayManager:
axis = self._normalize_axis(axis)

if axis == 0:
Expand Down
17 changes: 17 additions & 0 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ def blklocs(self) -> npt.NDArray[np.intp]:
def make_empty(self, axes=None) -> Self:
"""return an empty BlockManager with the items axis of len 0"""
if axes is None:
# TODO shallow copy remaining axis?
axes = [Index([])] + self.axes[1:]

# preserve dtype if possible
Expand Down Expand Up @@ -381,6 +382,7 @@ def apply(
applied = getattr(b, f)(**kwargs)
result_blocks = extend_blocks(applied, result_blocks)

# TODO shallow copy axes (in from_blocks or here?)
out = type(self).from_blocks(result_blocks, self.axes)
return out

Expand Down Expand Up @@ -539,6 +541,7 @@ def get_numeric_data(self, copy: bool = False) -> Self:
# Avoid somewhat expensive _combine
if copy:
return self.copy(deep=True)
# TODO(CoW) need to return a shallow copy here?
return self
return self._combine(numeric_blocks, copy)

Expand Down Expand Up @@ -570,6 +573,7 @@ def _combine(
new_blocks.append(nb)

axes = list(self.axes)
# TODO shallow copy of axes?
if index is not None:
axes[-1] = index
axes[0] = self.items.take(indexer)
Expand Down Expand Up @@ -641,6 +645,7 @@ def consolidate(self) -> Self:
if self.is_consolidated():
return self

# TODO shallow copy is not needed here?
bm = type(self)(self.blocks, self.axes, verify_integrity=False)
bm._is_consolidated = False
bm._consolidate_inplace()
Expand Down Expand Up @@ -685,6 +690,7 @@ def reindex_indexer(

if indexer is None:
if new_axis is self.axes[axis] and not copy:
# TODO(CoW) need to handle CoW?
return self

result = self.copy(deep=copy)
Expand Down Expand Up @@ -723,6 +729,8 @@ def reindex_indexer(

new_axes = list(self.axes)
new_axes[axis] = new_axis
if self.ndim == 2 and using_copy_on_write():
new_axes[1 - axis] = self.axes[1 - axis]._view()

new_mgr = type(self).from_blocks(new_blocks, new_axes)
if axis == 1:
Expand Down Expand Up @@ -1005,6 +1013,7 @@ def fast_xs(self, loc: int) -> SingleBlockManager:
ndim=1,
refs=self.blocks[0].refs,
)
# TODO shallow copy columns
return SingleBlockManager(block, self.axes[0])

dtype = interleaved_dtype([blk.dtype for blk in self.blocks])
Expand Down Expand Up @@ -1033,6 +1042,7 @@ def fast_xs(self, loc: int) -> SingleBlockManager:

bp = BlockPlacement(slice(0, len(result)))
block = new_block(result, placement=bp, ndim=1)
# TODO shallow copy columns
return SingleBlockManager(block, self.axes[0])

def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
Expand All @@ -1047,6 +1057,7 @@ def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
nb = type(block)(
values, placement=bp, ndim=1, refs=block.refs if track_ref else None
)
# TODO shallow copy index? (might already be done where this gets called)
return SingleBlockManager(nb, self.axes[1])

def iget_values(self, i: int) -> ArrayLike:
Expand Down Expand Up @@ -1447,6 +1458,7 @@ def idelete(self, indexer) -> BlockManager:

nbs = self._slice_take_blocks_ax0(taker, only_slice=True, ref_inplace_op=True)
new_columns = self.items[~is_deleted]
# TODO shallow copy index?
axes = [new_columns, self.axes[1]]
return type(self)(tuple(nbs), axes, verify_integrity=False)

Expand Down Expand Up @@ -1484,6 +1496,7 @@ def grouped_reduce(self, func: Callable) -> Self:
nrows = result_blocks[0].values.shape[-1]
index = Index(range(nrows))

# TODO shallow copy columns?
return type(self).from_blocks(result_blocks, [self.axes[0], index])

def reduce(self, func: Callable) -> Self:
Expand All @@ -1507,6 +1520,7 @@ def reduce(self, func: Callable) -> Self:
res_blocks.extend(nbs)

index = Index([None]) # placeholder
# TODO shallow copy self.items
new_mgr = type(self).from_blocks(res_blocks, [self.items, index])
return new_mgr

Expand Down Expand Up @@ -1548,6 +1562,7 @@ def quantile(
assert self.ndim >= 2
assert is_list_like(qs) # caller is responsible for this

# TODO shallow copy axes
new_axes = list(self.axes)
new_axes[1] = Index(qs, dtype=np.float64)

Expand Down Expand Up @@ -1820,6 +1835,7 @@ def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self:

offset += len(mgr.items)

# TODO relevant axis already shallow-copied at caller?
new_mgr = cls(tuple(blocks), axes)
return new_mgr

Expand Down Expand Up @@ -1889,6 +1905,7 @@ def to_2d_mgr(self, columns: Index) -> BlockManager:
arr = ensure_block_shape(blk.values, ndim=2)
bp = BlockPlacement(0)
new_blk = type(blk)(arr, placement=bp, ndim=2, refs=blk.refs)
# TODO shallow copy index
axes = [columns, self.axes[0]]
return BlockManager([new_blk], axes=axes, verify_integrity=False)

Expand Down
1 change: 1 addition & 0 deletions pandas/core/internals/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def operate_blockwise(
# assert len(slocs) == nlocs, (len(slocs), nlocs)
# assert slocs == set(range(nlocs)), slocs

# TODO shallow copy axes?
new_mgr = type(right)(tuple(res_blks), axes=right.axes, verify_integrity=False)
return new_mgr

Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/copy_view/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ def test_subset_column_selection(backend, using_copy_on_write):

subset = df[["a", "c"]]

if using_copy_on_write:
assert subset.index is not df.index

if using_copy_on_write:
# the subset shares memory ...
assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
Expand Down Expand Up @@ -111,6 +114,9 @@ def test_subset_row_slice(backend, using_copy_on_write):
subset = df[1:3]
subset._mgr._verify_integrity()

if using_copy_on_write:
assert subset.columns is not df.columns

assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))

if using_copy_on_write:
Expand Down Expand Up @@ -156,6 +162,9 @@ def test_subset_column_slice(
subset = df.iloc[:, 1:]
subset._mgr._verify_integrity()

if using_copy_on_write:
assert subset.index is not df.index

if using_copy_on_write:
assert np.shares_memory(get_array(subset, "b"), get_array(df, "b"))

Expand Down Expand Up @@ -219,6 +228,10 @@ def test_subset_loc_rows_columns(

subset = df.loc[row_indexer, column_indexer]

if using_copy_on_write:
assert subset.index is not df.index
assert subset.columns is not df.columns

# a few corner cases _do_ actually modify the parent (with both row and column
# slice, and in case of ArrayManager or BlockManager with single block)
mutate_parent = (
Expand Down Expand Up @@ -283,6 +296,10 @@ def test_subset_iloc_rows_columns(

subset = df.iloc[row_indexer, column_indexer]

if using_copy_on_write:
assert subset.index is not df.index
assert subset.columns is not df.columns

# a few corner cases _do_ actually modify the parent (with both row and column
# slice, and in case of ArrayManager or BlockManager with single block)
mutate_parent = (
Expand Down Expand Up @@ -761,6 +778,10 @@ def test_null_slice(backend, method, using_copy_on_write, warn_copy_on_write):

df2 = method(df)

if using_copy_on_write:
assert df2.index is not df.index
assert df2.columns is not df.columns

# we always return new objects (shallow copy), regardless of CoW or not
assert df2 is not df

Expand Down Expand Up @@ -790,6 +811,9 @@ def test_null_slice_series(backend, method, using_copy_on_write, warn_copy_on_wr

s2 = method(s)

if using_copy_on_write:
assert s2.index is not s.index

# we always return new objects, regardless of CoW or not
assert s2 is not s

Expand Down Expand Up @@ -947,6 +971,9 @@ def test_column_as_series(

s = df["a"]

if using_copy_on_write:
assert s.index is not df.index

assert np.shares_memory(get_array(s, "a"), get_array(df, "a"))

if using_copy_on_write or using_array_manager:
Expand Down Expand Up @@ -1043,6 +1070,10 @@ def test_column_as_series_no_item_cache(
s1 = method(df)
s2 = method(df)

if using_copy_on_write:
assert s1.index is not df.index
assert s1.index is not s2.index

is_iloc = "iloc" in request.node.name
if using_copy_on_write or warn_copy_on_write or is_iloc:
assert s1 is not s2
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ def test_copy(using_copy_on_write):
assert not df_copy._mgr.blocks[0].refs.has_reference()
assert not df_copy._mgr.blocks[1].refs.has_reference()

assert df_copy.index is not df.index
assert df_copy.columns is not df.columns

# mutating copy doesn't mutate original
df_copy.iloc[0, 0] = 0
assert df.iloc[0, 0] == 1
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/series/methods/test_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,8 @@ def test_align_same_index(datetime_series, using_copy_on_write):
assert a.index is datetime_series.index
assert b.index is datetime_series.index
else:
assert a.index is not datetime_series.index
assert b.index is not datetime_series.index
assert a.index.is_(datetime_series.index)
assert b.index.is_(datetime_series.index)

Expand Down