Skip to content

Commit f7c03dc

Browse files
tswast and Genesis929 authored
feat: add subset parameter to DataFrame.dropna to select which columns to consider (#981)
* feat: add `subset` parameter to `DataFrame.dropna` to select which columns to consider

* fix dropna with subset=None

* refactor: remove circular dependencies preventing local doctest runs

  With this change I can once again run

  ```
  pytest --doctest-modules third_party/bigframes_vendored/pandas/core/frame.py
  ```

  Note: having multiple `version.py` files should be fine. release-please will update all such files it finds.

* fix doctest

* Revert "Merge branch 'tswast-circular-import' into b366248570-dropna-subset"

  This reverts commit 57e8335, reversing changes made to 197074a.

* Reapply "Merge branch 'tswast-circular-import' into b366248570-dropna-subset"

  This reverts commit 0f18294.

* loop over tuple result

---------

Co-authored-by: Huan Chen <[email protected]>
1 parent deac6d2 commit f7c03dc

File tree

5 files changed: +69 additions, -12 deletions

bigframes/core/block_transforms.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
import functools
1717
import typing
18-
from typing import Sequence
18+
from typing import Optional, Sequence
1919

2020
import bigframes_vendored.constants as constants
2121
import pandas as pd
@@ -488,11 +488,19 @@ def dropna(
488488
block: blocks.Block,
489489
column_ids: typing.Sequence[str],
490490
how: typing.Literal["all", "any"] = "any",
491+
subset: Optional[typing.Sequence[str]] = None,
491492
):
492493
"""
493494
Drop na entries from block
494495
"""
495-
predicates = [ops.notnull_op.as_expr(column_id) for column_id in column_ids]
496+
if subset is None:
497+
subset = column_ids
498+
499+
predicates = [
500+
ops.notnull_op.as_expr(column_id)
501+
for column_id in column_ids
502+
if column_id in subset
503+
]
496504
if len(predicates) == 0:
497505
return block
498506
if how == "any":

bigframes/dataframe.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2027,8 +2027,9 @@ def dropna(
20272027
self,
20282028
*,
20292029
axis: int | str = 0,
2030-
inplace: bool = False,
20312030
how: str = "any",
2031+
subset: typing.Union[None, blocks.Label, Sequence[blocks.Label]] = None,
2032+
inplace: bool = False,
20322033
ignore_index=False,
20332034
) -> DataFrame:
20342035
if inplace:
@@ -2040,8 +2041,25 @@ def dropna(
20402041

20412042
axis_n = utils.get_axis_number(axis)
20422043

2044+
if subset is not None and axis_n != 0:
2045+
raise NotImplementedError(
2046+
f"subset only supported when axis=0. {constants.FEEDBACK_LINK}"
2047+
)
2048+
20432049
if axis_n == 0:
2044-
result = block_ops.dropna(self._block, self._block.value_columns, how=how) # type: ignore
2050+
# subset needs to be converted into column IDs, not column labels.
2051+
if subset is None:
2052+
subset_ids = None
2053+
elif not utils.is_list_like(subset):
2054+
subset_ids = [id_ for id_ in self._block.label_to_col_id[subset]]
2055+
else:
2056+
subset_ids = [
2057+
id_
2058+
for label in subset
2059+
for id_ in self._block.label_to_col_id[label]
2060+
]
2061+
2062+
result = block_ops.dropna(self._block, self._block.value_columns, how=how, subset=subset_ids) # type: ignore
20452063
if ignore_index:
20462064
result = result.reset_index()
20472065
return DataFrame(result)

tests/system/small/test_dataframe.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -936,19 +936,24 @@ def test_assign_callable_lambda(scalars_dfs):
936936

937937
@skip_legacy_pandas
938938
@pytest.mark.parametrize(
939-
("axis", "how", "ignore_index"),
939+
("axis", "how", "ignore_index", "subset"),
940940
[
941-
(0, "any", False),
942-
(0, "any", True),
943-
(1, "any", False),
944-
(1, "all", False),
941+
(0, "any", False, None),
942+
(0, "any", True, None),
943+
(0, "all", False, ["bool_col", "time_col"]),
944+
(0, "any", False, ["bool_col", "time_col"]),
945+
(0, "all", False, "time_col"),
946+
(1, "any", False, None),
947+
(1, "all", False, None),
945948
],
946949
)
947-
def test_df_dropna(scalars_dfs, axis, how, ignore_index):
950+
def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset):
948951
scalars_df, scalars_pandas_df = scalars_dfs
949-
df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index)
952+
df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset)
950953
bf_result = df.to_pandas()
951-
pd_result = scalars_pandas_df.dropna(axis=axis, how=how, ignore_index=ignore_index)
954+
pd_result = scalars_pandas_df.dropna(
955+
axis=axis, how=how, ignore_index=ignore_index, subset=subset
956+
)
952957

953958
# Pandas uses int64 instead of Int64 (nullable) dtype.
954959
pd_result.index = pd_result.index.astype(pd.Int64Dtype())

tests/unit/test_dataframe.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,15 @@
2020
from . import resources
2121

2222

23+
def test_dataframe_dropna_axis_1_subset_not_implememented(
24+
monkeypatch: pytest.MonkeyPatch,
25+
):
26+
dataframe = resources.create_dataframe(monkeypatch)
27+
28+
with pytest.raises(NotImplementedError, match="subset"):
29+
dataframe.dropna(axis=1, subset=["col1", "col2"])
30+
31+
2332
def test_dataframe_repr_with_uninitialized_object():
2433
"""Ensures DataFrame.__init__ can be paused in a visual debugger without crashing.
2534

third_party/bigframes_vendored/pandas/core/frame.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1614,6 +1614,8 @@ def dropna(
16141614
*,
16151615
axis: int | str = 0,
16161616
how: str = "any",
1617+
subset=None,
1618+
inplace: bool = False,
16171619
ignore_index=False,
16181620
) -> DataFrame:
16191621
"""Remove missing values.
@@ -1662,6 +1664,15 @@ def dropna(
16621664
<BLANKLINE>
16631665
[3 rows x 3 columns]
16641666
1667+
Define in which columns to look for missing values.
1668+
1669+
>>> df.dropna(subset=['name', 'toy'])
1670+
name toy born
1671+
1 Batman Batmobile 1940-04-25
1672+
2 Catwoman Bullwhip <NA>
1673+
<BLANKLINE>
1674+
[2 rows x 3 columns]
1675+
16651676
Args:
16661677
axis ({0 or 'index', 1 or 'columns'}, default 0):
16671678
Determine if rows or columns which contain missing values are
@@ -1675,6 +1686,12 @@ def dropna(
16751686
16761687
* 'any' : If any NA values are present, drop that row or column.
16771688
* 'all' : If all values are NA, drop that row or column.
1689+
subset (column label or sequence of labels, optional):
1690+
Labels along other axis to consider, e.g. if you are dropping
1691+
rows these would be a list of columns to include.
1692+
Only supports axis=0.
1693+
inplace (bool, default ``False``):
1694+
Not supported.
16781695
ignore_index (bool, default ``False``):
16791696
If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
16801697

0 commit comments

Comments
 (0)