feat: add subset parameter to DataFrame.dropna to select which columns to consider (#981)

tswast · Genesis929 · web-flow · commit f7c03dcaf7ee · 2024-09-16T14:22:26.000-05:00
* feat: add `subset` parameter to `DataFrame.dropna` to select which columns to consider * fix dropna with subset=None * refactor: remove circular dependencies preventing local doctest runs With this change I can once again run ``` pytest --doctest-modules third_party/bigframes_vendored/pandas/core/frame.py ``` Note: having multiple `version.py` files should be fine. release-please will update all such files it finds. * fix doctest * Revert "Merge branch 'tswast-circular-import' into b366248570-dropna-subset" This reverts commit 57e8335, reversing changes made to 197074a. * Reapply "Merge branch 'tswast-circular-import' into b366248570-dropna-subset" This reverts commit 0f18294. * loop over tuple result --------- Co-authored-by: Huan Chen <142538604+Genesis929@users.noreply.github.com>
diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
@@ -15,7 +15,7 @@
 
 import functools
 import typing
-from typing import Sequence
+from typing import Optional, Sequence
 
 import bigframes_vendored.constants as constants
 import pandas as pd
@@ -488,11 +488,19 @@ def dropna(
     block: blocks.Block,
     column_ids: typing.Sequence[str],
     how: typing.Literal["all", "any"] = "any",
+    subset: Optional[typing.Sequence[str]] = None,
 ):
     """
     Drop na entries from block
     """
-    predicates = [ops.notnull_op.as_expr(column_id) for column_id in column_ids]
+    if subset is None:
+        subset = column_ids
+
+    predicates = [
+        ops.notnull_op.as_expr(column_id)
+        for column_id in column_ids
+        if column_id in subset
+    ]
     if len(predicates) == 0:
         return block
     if how == "any":
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -2027,8 +2027,9 @@ def dropna(
         self,
         *,
         axis: int | str = 0,
-        inplace: bool = False,
         how: str = "any",
+        subset: typing.Union[None, blocks.Label, Sequence[blocks.Label]] = None,
+        inplace: bool = False,
         ignore_index=False,
     ) -> DataFrame:
         if inplace:
@@ -2040,8 +2041,25 @@ def dropna(
 
         axis_n = utils.get_axis_number(axis)
 
+        if subset is not None and axis_n != 0:
+            raise NotImplementedError(
+                f"subset only supported when axis=0. {constants.FEEDBACK_LINK}"
+            )
+
         if axis_n == 0:
-            result = block_ops.dropna(self._block, self._block.value_columns, how=how)  # type: ignore
+            # subset needs to be converted into column IDs, not column labels.
+            if subset is None:
+                subset_ids = None
+            elif not utils.is_list_like(subset):
+                subset_ids = [id_ for id_ in self._block.label_to_col_id[subset]]
+            else:
+                subset_ids = [
+                    id_
+                    for label in subset
+                    for id_ in self._block.label_to_col_id[label]
+                ]
+
+            result = block_ops.dropna(self._block, self._block.value_columns, how=how, subset=subset_ids)  # type: ignore
             if ignore_index:
                 result = result.reset_index()
             return DataFrame(result)
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -936,19 +936,24 @@ def test_assign_callable_lambda(scalars_dfs):
 
 @skip_legacy_pandas
 @pytest.mark.parametrize(
-    ("axis", "how", "ignore_index"),
+    ("axis", "how", "ignore_index", "subset"),
     [
-        (0, "any", False),
-        (0, "any", True),
-        (1, "any", False),
-        (1, "all", False),
+        (0, "any", False, None),
+        (0, "any", True, None),
+        (0, "all", False, ["bool_col", "time_col"]),
+        (0, "any", False, ["bool_col", "time_col"]),
+        (0, "all", False, "time_col"),
+        (1, "any", False, None),
+        (1, "all", False, None),
     ],
 )
-def test_df_dropna(scalars_dfs, axis, how, ignore_index):
+def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset):
     scalars_df, scalars_pandas_df = scalars_dfs
-    df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index)
+    df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset)
     bf_result = df.to_pandas()
-    pd_result = scalars_pandas_df.dropna(axis=axis, how=how, ignore_index=ignore_index)
+    pd_result = scalars_pandas_df.dropna(
+        axis=axis, how=how, ignore_index=ignore_index, subset=subset
+    )
 
     # Pandas uses int64 instead of Int64 (nullable) dtype.
     pd_result.index = pd_result.index.astype(pd.Int64Dtype())
diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py
@@ -20,6 +20,15 @@
 from . import resources
 
 
+def test_dataframe_dropna_axis_1_subset_not_implememented(
+    monkeypatch: pytest.MonkeyPatch,
+):
+    dataframe = resources.create_dataframe(monkeypatch)
+
+    with pytest.raises(NotImplementedError, match="subset"):
+        dataframe.dropna(axis=1, subset=["col1", "col2"])
+
+
 def test_dataframe_repr_with_uninitialized_object():
     """Ensures DataFrame.__init__ can be paused in a visual debugger without crashing.
 
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -1614,6 +1614,8 @@ def dropna(
         *,
         axis: int | str = 0,
         how: str = "any",
+        subset=None,
+        inplace: bool = False,
         ignore_index=False,
     ) -> DataFrame:
         """Remove missing values.
@@ -1662,6 +1664,15 @@ def dropna(
             <BLANKLINE>
             [3 rows x 3 columns]
 
+        Define in which columns to look for missing values.
+
+            >>> df.dropna(subset=['name', 'toy'])
+                   name        toy        born
+            1    Batman  Batmobile  1940-04-25
+            2  Catwoman   Bullwhip        <NA>
+            <BLANKLINE>
+            [2 rows x 3 columns]
+
         Args:
-            axis ({0 or 'index', 1 or 'columns'}, default 'columns'):
+            axis ({0 or 'index', 1 or 'columns'}, default 0):
                 Determine if rows or columns which contain missing values are
@@ -1675,6 +1686,12 @@ def dropna(
 
                 * 'any' : If any NA values are present, drop that row or column.
                 * 'all' : If all values are NA, drop that row or column.
+            subset (column label or sequence of labels, optional):
+                Labels along the other axis to consider, e.g. if you are
+                dropping rows, these are the columns checked for missing
+                values. Only supported when ``axis=0``.
+            inplace (bool, default ``False``):
+                Not supported.
             ignore_index (bool, default ``False``):
                 If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.