diff --git a/CHANGELOG.md b/CHANGELOG.md index 433956da3d..fdd060f1f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,21 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.19.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.18.0...v2.19.0) (2025-09-09) + + +### Features + +* Add str.join method ([#2054](https://github.com/googleapis/python-bigquery-dataframes/issues/2054)) ([8804ada](https://github.com/googleapis/python-bigquery-dataframes/commit/8804adaf8ba23fdcad6e42a7bf034bd0a11c890f)) +* Support display.max_colwidth option ([#2053](https://github.com/googleapis/python-bigquery-dataframes/issues/2053)) ([5229e07](https://github.com/googleapis/python-bigquery-dataframes/commit/5229e07b4535c01b0cdbd731455ff225a373b5c8)) +* Support VPC egress setting in remote function ([#2059](https://github.com/googleapis/python-bigquery-dataframes/issues/2059)) ([5df779d](https://github.com/googleapis/python-bigquery-dataframes/commit/5df779d4f421d3ba777cfd928d99ca2e8a3f79ad)) + + +### Bug Fixes + +* Fix issue mishandling chunked array while loading data ([#2051](https://github.com/googleapis/python-bigquery-dataframes/issues/2051)) ([873d0ee](https://github.com/googleapis/python-bigquery-dataframes/commit/873d0eee474ed34f1d5164c37383f2737dbec4db)) +* Remove warning for slot_millis_sum ([#2047](https://github.com/googleapis/python-bigquery-dataframes/issues/2047)) ([425a691](https://github.com/googleapis/python-bigquery-dataframes/commit/425a6917d5442eeb4df486c6eed1fd136bbcedfb)) + ## [2.18.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.17.0...v2.18.0) (2025-09-03) diff --git a/bigframes/_config/display_options.py b/bigframes/_config/display_options.py index 360292dd80..b7ce29e47e 100644 --- a/bigframes/_config/display_options.py +++ b/bigframes/_config/display_options.py @@ -35,6 +35,7 @@ class DisplayOptions: progress_bar: Optional[str] = "auto" repr_mode: Literal["head", "deferred", "anywidget"] = "head" + max_colwidth: Optional[int] = 50 max_info_columns: int = 100 max_info_rows: Optional[int] = 200000 memory_usage: bool = True @@ -52,6 +53,8 @@ def pandas_repr(display_options: DisplayOptions): so that we don't override pandas behavior. """ with pd.option_context( + "display.max_colwidth", + display_options.max_colwidth, "display.max_columns", display_options.max_columns, "display.max_rows", diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index dbaea57005..32412648d6 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -16,6 +16,8 @@ such as array functions: https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions. 
""" +import sys + from bigframes.bigquery._operations.approx_agg import approx_top_count from bigframes.bigquery._operations.array import ( array_agg, @@ -52,43 +54,51 @@ from bigframes.bigquery._operations.search import create_vector_index, vector_search from bigframes.bigquery._operations.sql import sql_scalar from bigframes.bigquery._operations.struct import struct +from bigframes.core import log_adapter -__all__ = [ +_functions = [ # approximate aggregate ops - "approx_top_count", + approx_top_count, # array ops - "array_agg", - "array_length", - "array_to_string", + array_agg, + array_length, + array_to_string, # datetime ops - "unix_micros", - "unix_millis", - "unix_seconds", + unix_micros, + unix_millis, + unix_seconds, # geo ops - "st_area", - "st_buffer", - "st_centroid", - "st_convexhull", - "st_difference", - "st_distance", - "st_intersection", - "st_isclosed", - "st_length", + st_area, + st_buffer, + st_centroid, + st_convexhull, + st_difference, + st_distance, + st_intersection, + st_isclosed, + st_length, # json ops - "json_extract", - "json_extract_array", - "json_extract_string_array", - "json_query", - "json_query_array", - "json_set", - "json_value", - "json_value_array", - "parse_json", + json_extract, + json_extract_array, + json_extract_string_array, + json_query, + json_query_array, + json_set, + json_value, + json_value_array, + parse_json, # search ops - "create_vector_index", - "vector_search", + create_vector_index, + vector_search, # sql ops - "sql_scalar", + sql_scalar, # struct ops - "struct", + struct, ] + +__all__ = [f.__name__ for f in _functions] + +_module = sys.modules[__name__] +for f in _functions: + _decorated_object = log_adapter.method_logger(f, custom_base_name="bigquery") + setattr(_module, f.__name__, _decorated_object) diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py index 5063fc9118..c16c2af1a9 100644 --- a/bigframes/bigquery/_operations/search.py +++ b/bigframes/bigquery/_operations/search.py @@ -20,7 +20,6 @@ import google.cloud.bigquery as bigquery -import bigframes.core.sql import bigframes.ml.utils as utils if typing.TYPE_CHECKING: diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py index a84c074e01..a2de61fc21 100644 --- a/bigframes/bigquery/_operations/sql.py +++ b/bigframes/bigquery/_operations/sql.py @@ -21,8 +21,6 @@ import google.cloud.bigquery import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir -import bigframes.core.sql -import bigframes.dataframe import bigframes.dtypes import bigframes.operations import bigframes.series diff --git a/bigframes/core/agg_expressions.py b/bigframes/core/agg_expressions.py new file mode 100644 index 0000000000..f77525706b --- /dev/null +++ b/bigframes/core/agg_expressions.py @@ -0,0 +1,151 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import abc +import dataclasses +import functools +import itertools +import typing +from typing import Callable, Mapping, TypeVar + +from bigframes import dtypes +from bigframes.core import expression +import bigframes.core.identifiers as ids +import bigframes.operations.aggregations as agg_ops + +TExpression = TypeVar("TExpression", bound="Aggregation") + + +@dataclasses.dataclass(frozen=True) +class Aggregation(expression.Expression): + """Represents windowing or aggregation over a column.""" + + op: agg_ops.WindowOp = dataclasses.field() + + @property + def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: + return tuple( + itertools.chain.from_iterable( + map(lambda x: x.column_references, self.inputs) + ) + ) + + @functools.cached_property + def is_resolved(self) -> bool: + return all(input.is_resolved for input in self.inputs) + + @functools.cached_property + def output_type(self) -> dtypes.ExpressionType: + if not self.is_resolved: + raise ValueError(f"Type of expression {self.op} has not been fixed.") + + input_types = [input.output_type for input in self.inputs] + + return self.op.output_type(*input_types) + + @property + @abc.abstractmethod + def inputs( + self, + ) -> typing.Tuple[expression.Expression, ...]: + ... + + @property + def free_variables(self) -> typing.Tuple[str, ...]: + return tuple( + itertools.chain.from_iterable(map(lambda x: x.free_variables, self.inputs)) + ) + + @property + def is_const(self) -> bool: + return all(child.is_const for child in self.inputs) + + @abc.abstractmethod + def replace_args(self: TExpression, *arg) -> TExpression: + ... + + def transform_children( + self: TExpression, t: Callable[[expression.Expression], expression.Expression] + ) -> TExpression: + return self.replace_args(*(t(arg) for arg in self.inputs)) + + def bind_variables( + self: TExpression, + bindings: Mapping[str, expression.Expression], + allow_partial_bindings: bool = False, + ) -> TExpression: + return self.transform_children( + lambda x: x.bind_variables(bindings, allow_partial_bindings) + ) + + def bind_refs( + self: TExpression, + bindings: Mapping[ids.ColumnId, expression.Expression], + allow_partial_bindings: bool = False, + ) -> TExpression: + return self.transform_children( + lambda x: x.bind_refs(bindings, allow_partial_bindings) + ) + + +@dataclasses.dataclass(frozen=True) +class NullaryAggregation(Aggregation): + op: agg_ops.NullaryWindowOp = dataclasses.field() + + @property + def inputs( + self, + ) -> typing.Tuple[expression.Expression, ...]: + return () + + def replace_args(self, *arg) -> NullaryAggregation: + return self + + +@dataclasses.dataclass(frozen=True) +class UnaryAggregation(Aggregation): + op: agg_ops.UnaryWindowOp + arg: expression.Expression + + @property + def inputs( + self, + ) -> typing.Tuple[expression.Expression, ...]: + return (self.arg,) + + def replace_args(self, arg: expression.Expression) -> UnaryAggregation: + return UnaryAggregation( + self.op, + arg, + ) + + +@dataclasses.dataclass(frozen=True) +class BinaryAggregation(Aggregation): + op: agg_ops.BinaryAggregateOp = dataclasses.field() + left: expression.Expression = dataclasses.field() + right: expression.Expression = dataclasses.field() + + @property + def inputs( + self, + ) -> typing.Tuple[expression.Expression, ...]: + return (self.left, self.right) + + def replace_args( + self, larg: expression.Expression, rarg: expression.Expression + ) -> BinaryAggregation: + return BinaryAggregation(self.op, larg, rarg) diff --git 
a/bigframes/core/array_value.py b/bigframes/core/array_value.py index b47637cb59..b37c581a4a 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -24,6 +24,7 @@ import pandas import pyarrow as pa +from bigframes.core import agg_expressions import bigframes.core.expression as ex import bigframes.core.guid import bigframes.core.identifiers as ids @@ -190,7 +191,7 @@ def row_count(self) -> ArrayValue: child=self.node, aggregations=( ( - ex.NullaryAggregation(agg_ops.size_op), + agg_expressions.NullaryAggregation(agg_ops.size_op), ids.ColumnId(bigframes.core.guid.generate_guid()), ), ), @@ -379,7 +380,7 @@ def drop_columns(self, columns: Iterable[str]) -> ArrayValue: def aggregate( self, - aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]], + aggregations: typing.Sequence[typing.Tuple[agg_expressions.Aggregation, str]], by_column_ids: typing.Sequence[str] = (), dropna: bool = True, ) -> ArrayValue: @@ -420,7 +421,7 @@ def project_window_op( """ return self.project_window_expr( - ex.UnaryAggregation(op, ex.deref(column_name)), + agg_expressions.UnaryAggregation(op, ex.deref(column_name)), window_spec, never_skip_nulls, skip_reproject_unsafe, @@ -428,7 +429,7 @@ def project_window_op( def project_window_expr( self, - expression: ex.Aggregation, + expression: agg_expressions.Aggregation, window: WindowSpec, never_skip_nulls=False, skip_reproject_unsafe: bool = False, diff --git a/bigframes/core/bigframe_node.py b/bigframes/core/bigframe_node.py index 0c6f56f35a..7e40248a00 100644 --- a/bigframes/core/bigframe_node.py +++ b/bigframes/core/bigframe_node.py @@ -20,15 +20,12 @@ import functools import itertools import typing -from typing import Callable, Dict, Generator, Iterable, Mapping, Sequence, Tuple, Union +from typing import Callable, Dict, Generator, Iterable, Mapping, Sequence, Tuple from bigframes.core import expression, field, identifiers import bigframes.core.schema as schemata import bigframes.dtypes -if typing.TYPE_CHECKING: - import bigframes.session - COLUMN_SET = frozenset[identifiers.ColumnId] T = typing.TypeVar("T") @@ -281,8 +278,8 @@ def field_by_id(self) -> Mapping[identifiers.ColumnId, field.Field]: @property def _node_expressions( self, - ) -> Sequence[Union[expression.Expression, expression.Aggregation]]: - """List of scalar expressions. Intended for checking engine compatibility with used ops.""" + ) -> Sequence[expression.Expression]: + """List of expressions. 
Intended for checking engine compatibility with used ops.""" return () # Plan algorithms diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 465728b0ef..279643b91d 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -21,12 +21,12 @@ import pandas as pd import bigframes.constants +from bigframes.core import agg_expressions import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.expression as ex import bigframes.core.ordering as ordering import bigframes.core.window_spec as windows -import bigframes.dtypes import bigframes.dtypes as dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -133,7 +133,7 @@ def quantile( block, _ = block.aggregate( grouping_column_ids, tuple( - ex.UnaryAggregation(agg_ops.AnyValueOp(), ex.deref(col)) + agg_expressions.UnaryAggregation(agg_ops.AnyValueOp(), ex.deref(col)) for col in quantile_cols ), column_labels=pd.Index(labels), @@ -363,7 +363,7 @@ def value_counts( block = dropna(block, columns, how="any") block, agg_ids = block.aggregate( by_column_ids=(*grouping_keys, *columns), - aggregations=[ex.NullaryAggregation(agg_ops.size_op)], + aggregations=[agg_expressions.NullaryAggregation(agg_ops.size_op)], dropna=drop_na and not grouping_keys, ) count_id = agg_ids[0] @@ -647,15 +647,15 @@ def skew( # counts, moment3 for each column aggregations = [] for i, col in enumerate(original_columns): - count_agg = ex.UnaryAggregation( + count_agg = agg_expressions.UnaryAggregation( agg_ops.count_op, ex.deref(col), ) - moment3_agg = ex.UnaryAggregation( + moment3_agg = agg_expressions.UnaryAggregation( agg_ops.mean_op, ex.deref(delta3_ids[i]), ) - variance_agg = ex.UnaryAggregation( + variance_agg = agg_expressions.UnaryAggregation( agg_ops.PopVarOp(), ex.deref(col), ) @@ -698,9 +698,13 @@ def kurt( # counts, moment4 for each column aggregations = [] for i, col in enumerate(original_columns): - count_agg = ex.UnaryAggregation(agg_ops.count_op, ex.deref(col)) - moment4_agg = ex.UnaryAggregation(agg_ops.mean_op, ex.deref(delta4_ids[i])) - variance_agg = ex.UnaryAggregation(agg_ops.PopVarOp(), ex.deref(col)) + count_agg = agg_expressions.UnaryAggregation(agg_ops.count_op, ex.deref(col)) + moment4_agg = agg_expressions.UnaryAggregation( + agg_ops.mean_op, ex.deref(delta4_ids[i]) + ) + variance_agg = agg_expressions.UnaryAggregation( + agg_ops.PopVarOp(), ex.deref(col) + ) aggregations.extend([count_agg, moment4_agg, variance_agg]) block, agg_ids = block.aggregate( diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 07d7e4c45b..d62173b7d6 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -51,8 +51,9 @@ from bigframes import session from bigframes._config import sampling_options import bigframes.constants -from bigframes.core import local_data +from bigframes.core import agg_expressions, local_data import bigframes.core as core +import bigframes.core.agg_expressions as ex_types import bigframes.core.compile.googlesql as googlesql import bigframes.core.expression as ex import bigframes.core.expression as scalars @@ -102,15 +103,24 @@ class PandasBatches(Iterator[pd.DataFrame]): """Interface for mutable objects with state represented by a block value object.""" def __init__( - self, pandas_batches: Iterator[pd.DataFrame], total_rows: Optional[int] = 0 + self, + pandas_batches: Iterator[pd.DataFrame], + total_rows: Optional[int] = 0, + *, + total_bytes_processed: Optional[int] = 0, ): 
self._dataframes: Iterator[pd.DataFrame] = pandas_batches self._total_rows: Optional[int] = total_rows + self._total_bytes_processed: Optional[int] = total_bytes_processed @property def total_rows(self) -> Optional[int]: return self._total_rows + @property + def total_bytes_processed(self) -> Optional[int]: + return self._total_bytes_processed + def __next__(self) -> pd.DataFrame: return next(self._dataframes) @@ -721,7 +731,9 @@ def to_pandas_batches( if (total_rows is not None) and (max_results is not None): total_rows = min(total_rows, max_results) - return PandasBatches(dfs, total_rows) + return PandasBatches( + dfs, total_rows, total_bytes_processed=execute_result.total_bytes_processed + ) def _copy_index_to_pandas(self, df: pd.DataFrame) -> pd.DataFrame: """Set the index on pandas DataFrame to match this block.""" @@ -1143,7 +1155,7 @@ def apply_window_op( skip_reproject_unsafe: bool = False, never_skip_nulls: bool = False, ) -> typing.Tuple[Block, str]: - agg_expr = ex.UnaryAggregation(op, ex.deref(column)) + agg_expr = agg_expressions.UnaryAggregation(op, ex.deref(column)) return self.apply_analytic( agg_expr, window_spec, @@ -1155,7 +1167,7 @@ def apply_window_op( def apply_analytic( self, - agg_expr: ex.Aggregation, + agg_expr: agg_expressions.Aggregation, window: windows.WindowSpec, result_label: Label, *, @@ -1248,9 +1260,9 @@ def aggregate_all_and_stack( if axis_n == 0: aggregations = [ ( - ex.UnaryAggregation(operation, ex.deref(col_id)) + agg_expressions.UnaryAggregation(operation, ex.deref(col_id)) if isinstance(operation, agg_ops.UnaryAggregateOp) - else ex.NullaryAggregation(operation), + else agg_expressions.NullaryAggregation(operation), col_id, ) for col_id in self.value_columns @@ -1279,7 +1291,10 @@ def aggregate_size( ): """Returns a block object to compute the size(s) of groups.""" agg_specs = [ - (ex.NullaryAggregation(agg_ops.SizeOp()), guid.generate_guid()), + ( + agg_expressions.NullaryAggregation(agg_ops.SizeOp()), + guid.generate_guid(), + ), ] output_col_ids = [agg_spec[1] for agg_spec in agg_specs] result_expr = self.expr.aggregate(agg_specs, by_column_ids, dropna=dropna) @@ -1350,7 +1365,7 @@ def remap_f(x): def aggregate( self, by_column_ids: typing.Sequence[str] = (), - aggregations: typing.Sequence[ex.Aggregation] = (), + aggregations: typing.Sequence[agg_expressions.Aggregation] = (), column_labels: Optional[pd.Index] = None, *, dropna: bool = True, @@ -1419,9 +1434,9 @@ def get_stat( aggregations = [ ( - ex.UnaryAggregation(stat, ex.deref(column_id)) + agg_expressions.UnaryAggregation(stat, ex.deref(column_id)) if isinstance(stat, agg_ops.UnaryAggregateOp) - else ex.NullaryAggregation(stat), + else agg_expressions.NullaryAggregation(stat), stat.name, ) for stat in stats_to_fetch @@ -1447,7 +1462,7 @@ def get_binary_stat( # TODO(kemppeterson): Add a cache here. 
aggregations = [ ( - ex.BinaryAggregation( + agg_expressions.BinaryAggregation( stat, ex.deref(column_id_left), ex.deref(column_id_right) ), f"{stat.name}_{column_id_left}{column_id_right}", @@ -1474,9 +1489,9 @@ def summarize( labels = pd.Index([stat.name for stat in stats]) aggregations = [ ( - ex.UnaryAggregation(stat, ex.deref(col_id)) + agg_expressions.UnaryAggregation(stat, ex.deref(col_id)) if isinstance(stat, agg_ops.UnaryAggregateOp) - else ex.NullaryAggregation(stat), + else agg_expressions.NullaryAggregation(stat), f"{col_id}-{stat.name}", ) for stat in stats @@ -1750,7 +1765,7 @@ def pivot( block = block.select_columns(column_ids) aggregations = [ - ex.UnaryAggregation(agg_ops.AnyValueOp(), ex.deref(col_id)) + agg_expressions.UnaryAggregation(agg_ops.AnyValueOp(), ex.deref(col_id)) for col_id in column_ids ] result_block, _ = block.aggregate( @@ -2018,7 +2033,7 @@ def _generate_resample_label( agg_specs = [ ( - ex.UnaryAggregation(agg_ops.min_op, ex.deref(col_id)), + agg_expressions.UnaryAggregation(agg_ops.min_op, ex.deref(col_id)), guid.generate_guid(), ), ] @@ -2047,13 +2062,13 @@ def _generate_resample_label( # Generate integer label sequence. min_agg_specs = [ ( - ex.UnaryAggregation(agg_ops.min_op, ex.deref(label_col_id)), + ex_types.UnaryAggregation(agg_ops.min_op, ex.deref(label_col_id)), guid.generate_guid(), ), ] max_agg_specs = [ ( - ex.UnaryAggregation(agg_ops.max_op, ex.deref(label_col_id)), + ex_types.UnaryAggregation(agg_ops.max_op, ex.deref(label_col_id)), guid.generate_guid(), ), ] diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index f7de5c051a..b28880d498 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -30,6 +30,7 @@ import pyarrow as pa from bigframes.core import utils +import bigframes.core.agg_expressions as ex_types import bigframes.core.compile.googlesql import bigframes.core.compile.ibis_compiler.aggregate_compiler as agg_compiler import bigframes.core.compile.ibis_compiler.scalar_op_compiler as op_compilers @@ -215,7 +216,7 @@ def filter(self, predicate: ex.Expression) -> UnorderedIR: def aggregate( self, - aggregations: typing.Sequence[tuple[ex.Aggregation, str]], + aggregations: typing.Sequence[tuple[ex_types.Aggregation, str]], by_column_ids: typing.Sequence[ex.DerefOp] = (), order_by: typing.Sequence[OrderingExpression] = (), ) -> UnorderedIR: @@ -401,7 +402,7 @@ def isin_join( def project_window_op( self, - expression: ex.Aggregation, + expression: ex_types.Aggregation, window_spec: WindowSpec, output_name: str, *, @@ -467,7 +468,9 @@ def project_window_op( lambda x, y: x & y, per_col_does_count ).cast(int) observation_count = agg_compiler.compile_analytic( - ex.UnaryAggregation(agg_ops.sum_op, ex.deref("_observation_count")), + ex_types.UnaryAggregation( + agg_ops.sum_op, ex.deref("_observation_count") + ), window, bindings={"_observation_count": is_observation}, ) @@ -476,7 +479,7 @@ def project_window_op( # notnull is just used to convert null values to non-null (FALSE) values to be counted is_observation = inputs[0].notnull() observation_count = agg_compiler.compile_analytic( - ex.UnaryAggregation( + ex_types.UnaryAggregation( agg_ops.count_op, ex.deref("_observation_count") ), window, diff --git a/bigframes/core/compile/ibis_compiler/aggregate_compiler.py b/bigframes/core/compile/ibis_compiler/aggregate_compiler.py index 291db44524..1907078690 100644 --- a/bigframes/core/compile/ibis_compiler/aggregate_compiler.py +++ 
b/bigframes/core/compile/ibis_compiler/aggregate_compiler.py @@ -26,10 +26,10 @@ import bigframes_vendored.ibis.expr.types as ibis_types import pandas as pd +from bigframes.core import agg_expressions from bigframes.core.compile import constants as compiler_constants import bigframes.core.compile.ibis_compiler.scalar_op_compiler as scalar_compilers import bigframes.core.compile.ibis_types as compile_ibis_types -import bigframes.core.expression as ex import bigframes.core.window_spec as window_spec import bigframes.operations.aggregations as agg_ops @@ -48,19 +48,19 @@ def approx_quantiles(expression: float, number) -> List[float]: def compile_aggregate( - aggregate: ex.Aggregation, + aggregate: agg_expressions.Aggregation, bindings: typing.Dict[str, ibis_types.Value], order_by: typing.Sequence[ibis_types.Value] = [], ) -> ibis_types.Value: - if isinstance(aggregate, ex.NullaryAggregation): + if isinstance(aggregate, agg_expressions.NullaryAggregation): return compile_nullary_agg(aggregate.op) - if isinstance(aggregate, ex.UnaryAggregation): + if isinstance(aggregate, agg_expressions.UnaryAggregation): input = scalar_compiler.compile_expression(aggregate.arg, bindings=bindings) if not aggregate.op.order_independent: return compile_ordered_unary_agg(aggregate.op, input, order_by=order_by) # type: ignore else: return compile_unary_agg(aggregate.op, input) # type: ignore - elif isinstance(aggregate, ex.BinaryAggregation): + elif isinstance(aggregate, agg_expressions.BinaryAggregation): left = scalar_compiler.compile_expression(aggregate.left, bindings=bindings) right = scalar_compiler.compile_expression(aggregate.right, bindings=bindings) return compile_binary_agg(aggregate.op, left, right) # type: ignore @@ -69,16 +69,16 @@ def compile_aggregate( def compile_analytic( - aggregate: ex.Aggregation, + aggregate: agg_expressions.Aggregation, window: window_spec.WindowSpec, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: - if isinstance(aggregate, ex.NullaryAggregation): + if isinstance(aggregate, agg_expressions.NullaryAggregation): return compile_nullary_agg(aggregate.op, window) - elif isinstance(aggregate, ex.UnaryAggregation): + elif isinstance(aggregate, agg_expressions.UnaryAggregation): input = scalar_compiler.compile_expression(aggregate.arg, bindings=bindings) return compile_unary_agg(aggregate.op, input, window) # type: ignore - elif isinstance(aggregate, ex.BinaryAggregation): + elif isinstance(aggregate, agg_expressions.BinaryAggregation): raise NotImplementedError("binary analytic operations not yet supported") else: raise ValueError(f"Unexpected analytic operation: {aggregate}") @@ -676,6 +676,29 @@ def _( ).to_expr() +@compile_ordered_unary_agg.register +def _( + op: agg_ops.StringAggOp, + column: ibis_types.Column, + window=None, + order_by: typing.Sequence[ibis_types.Value] = [], +) -> ibis_types.ArrayValue: + if window is not None: + raise NotImplementedError( + f"StringAgg with windowing is not supported. 
{constants.FEEDBACK_LINK}" + ) + + return ( + ibis_ops.StringAgg( + column, # type: ignore + sep=op.sep, # type: ignore + order_by=order_by, # type: ignore + ) + .to_expr() + .fill_null(ibis_types.literal("")) + ) + + @compile_binary_agg.register def _( op: agg_ops.CorrOp, left: ibis_types.Column, right: ibis_types.Column, window=None diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 969ae2659d..044fc90306 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1216,11 +1216,18 @@ def to_arry_op_impl(*values: ibis_types.Value): def array_reduce_op_impl(x: ibis_types.Value, op: ops.ArrayReduceOp): import bigframes.core.compile.ibis_compiler.aggregate_compiler as agg_compilers - return typing.cast(ibis_types.ArrayValue, x).reduce( - lambda arr_vals: agg_compilers.compile_unary_agg( - op.aggregation, typing.cast(ibis_types.Column, arr_vals) + if op.aggregation.order_independent: + return typing.cast(ibis_types.ArrayValue, x).reduce( + lambda arr_vals: agg_compilers.compile_unary_agg( + op.aggregation, typing.cast(ibis_types.Column, arr_vals) + ) + ) + else: + return typing.cast(ibis_types.ArrayValue, x).reduce( + lambda arr_vals: agg_compilers.compile_ordered_unary_agg( + op.aggregation, typing.cast(ibis_types.Column, arr_vals) + ) ) - ) # JSON Ops diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 70fa516e51..df84f08852 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -22,7 +22,7 @@ import pandas as pd import bigframes.core -from bigframes.core import identifiers, nodes, ordering, window_spec +from bigframes.core import agg_expressions, identifiers, nodes, ordering, window_spec from bigframes.core.compile.polars import lowering import bigframes.core.expression as ex import bigframes.core.guid as guid @@ -443,15 +443,15 @@ class PolarsAggregateCompiler: def get_args( self, - agg: ex.Aggregation, + agg: agg_expressions.Aggregation, ) -> Sequence[pl.Expr]: """Prepares arguments for aggregation by compiling them.""" - if isinstance(agg, ex.NullaryAggregation): + if isinstance(agg, agg_expressions.NullaryAggregation): return [] - elif isinstance(agg, ex.UnaryAggregation): + elif isinstance(agg, agg_expressions.UnaryAggregation): arg = self.scalar_compiler.compile_expression(agg.arg) return [arg] - elif isinstance(agg, ex.BinaryAggregation): + elif isinstance(agg, agg_expressions.BinaryAggregation): larg = self.scalar_compiler.compile_expression(agg.left) rarg = self.scalar_compiler.compile_expression(agg.right) return [larg, rarg] @@ -460,13 +460,13 @@ def get_args( f"Aggregation {agg} not yet supported in polars engine." 
) - def compile_agg_expr(self, expr: ex.Aggregation): - if isinstance(expr, ex.NullaryAggregation): + def compile_agg_expr(self, expr: agg_expressions.Aggregation): + if isinstance(expr, agg_expressions.NullaryAggregation): inputs: Tuple = () - elif isinstance(expr, ex.UnaryAggregation): + elif isinstance(expr, agg_expressions.UnaryAggregation): assert isinstance(expr.arg, ex.DerefOp) inputs = (expr.arg.id.sql,) - elif isinstance(expr, ex.BinaryAggregation): + elif isinstance(expr, agg_expressions.BinaryAggregation): assert isinstance(expr.left, ex.DerefOp) assert isinstance(expr.right, ex.DerefOp) inputs = ( @@ -769,7 +769,9 @@ def compile_agg(self, node: nodes.AggregateNode): def _aggregate( self, df: pl.LazyFrame, - aggregations: Sequence[Tuple[ex.Aggregation, identifiers.ColumnId]], + aggregations: Sequence[ + Tuple[agg_expressions.Aggregation, identifiers.ColumnId] + ], grouping_keys: Tuple[ex.DerefOp, ...], ) -> pl.LazyFrame: # Need to materialize columns to broadcast constants @@ -858,7 +860,7 @@ def compile_window(self, node: nodes.WindowOpNode): def _calc_row_analytic_func( self, frame: pl.LazyFrame, - agg_expr: ex.Aggregation, + agg_expr: agg_expressions.Aggregation, window: window_spec.WindowSpec, name: str, ) -> pl.LazyFrame: diff --git a/bigframes/core/compile/sqlglot/aggregate_compiler.py b/bigframes/core/compile/sqlglot/aggregate_compiler.py index 52ef4cc26c..ccfba1ce0f 100644 --- a/bigframes/core/compile/sqlglot/aggregate_compiler.py +++ b/bigframes/core/compile/sqlglot/aggregate_compiler.py @@ -15,7 +15,7 @@ import sqlglot.expressions as sge -from bigframes.core import expression, window_spec +from bigframes.core import agg_expressions, window_spec from bigframes.core.compile.sqlglot.aggregations import ( binary_compiler, nullary_compiler, @@ -27,13 +27,13 @@ def compile_aggregate( - aggregate: expression.Aggregation, + aggregate: agg_expressions.Aggregation, order_by: tuple[sge.Expression, ...], ) -> sge.Expression: """Compiles BigFrames aggregation expression into SQLGlot expression.""" - if isinstance(aggregate, expression.NullaryAggregation): + if isinstance(aggregate, agg_expressions.NullaryAggregation): return nullary_compiler.compile(aggregate.op) - if isinstance(aggregate, expression.UnaryAggregation): + if isinstance(aggregate, agg_expressions.UnaryAggregation): column = typed_expr.TypedExpr( scalar_compiler.compile_scalar_expression(aggregate.arg), aggregate.arg.output_type, @@ -44,7 +44,7 @@ def compile_aggregate( ) else: return unary_compiler.compile(aggregate.op, column) - elif isinstance(aggregate, expression.BinaryAggregation): + elif isinstance(aggregate, agg_expressions.BinaryAggregation): left = typed_expr.TypedExpr( scalar_compiler.compile_scalar_expression(aggregate.left), aggregate.left.output_type, @@ -59,18 +59,18 @@ def compile_aggregate( def compile_analytic( - aggregate: expression.Aggregation, + aggregate: agg_expressions.Aggregation, window: window_spec.WindowSpec, ) -> sge.Expression: - if isinstance(aggregate, expression.NullaryAggregation): + if isinstance(aggregate, agg_expressions.NullaryAggregation): return nullary_compiler.compile(aggregate.op) - if isinstance(aggregate, expression.UnaryAggregation): + if isinstance(aggregate, agg_expressions.UnaryAggregation): column = typed_expr.TypedExpr( scalar_compiler.compile_scalar_expression(aggregate.arg), aggregate.arg.output_type, ) return unary_compiler.compile(aggregate.op, column, window) - elif isinstance(aggregate, expression.BinaryAggregation): + elif isinstance(aggregate, 
agg_expressions.BinaryAggregation): raise NotImplementedError("binary analytic operations not yet supported") else: raise ValueError(f"Unexpected analytic operation: {aggregate}") diff --git a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py index 98f1603be7..f519aef70d 100644 --- a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py @@ -27,6 +27,7 @@ import bigframes.core.compile.sqlglot.expressions.constants as constants from bigframes.core.compile.sqlglot.expressions.op_registration import OpRegistration from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr +import bigframes.dtypes as dtypes UNARY_OP_REGISTRATION = OpRegistration() @@ -420,7 +421,28 @@ def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: @UNARY_OP_REGISTRATION.register(ops.IsInOp) def _(op: ops.IsInOp, expr: TypedExpr) -> sge.Expression: - return sge.In(this=expr.expr, expressions=[sge.convert(v) for v in op.values]) + values = [] + is_numeric_expr = dtypes.is_numeric(expr.dtype) + for value in op.values: + if value is None: + continue + dtype = dtypes.bigframes_type(type(value)) + if expr.dtype == dtype or is_numeric_expr and dtypes.is_numeric(dtype): + values.append(sge.convert(value)) + + if op.match_nulls: + contains_nulls = any(_is_null(value) for value in op.values) + if contains_nulls: + return sge.Is(this=expr.expr, expression=sge.Null()) | sge.In( + this=expr.expr, expressions=values + ) + + if len(values) == 0: + return sge.convert(False) + + return sge.func( + "COALESCE", sge.In(this=expr.expr, expressions=values), sge.convert(False) + ) @UNARY_OP_REGISTRATION.register(ops.isalnum_op) @@ -767,7 +789,7 @@ def _(op: ops.ToTimedeltaOp, expr: TypedExpr) -> sge.Expression: factor = UNIT_TO_US_CONVERSION_FACTORS[op.unit] if factor != 1: value = sge.Mul(this=value, expression=sge.convert(factor)) - return sge.Interval(this=value, unit=sge.Identifier(this="MICROSECOND")) + return value @UNARY_OP_REGISTRATION.register(ops.UnixMicros) @@ -866,3 +888,9 @@ def _(op: ops.ZfillOp, expr: TypedExpr) -> sge.Expression: ], default=sge.func("LPAD", expr.expr, sge.convert(op.width), sge.convert("0")), ) + + +# Helpers +def _is_null(value) -> bool: + # float NaN/inf should be treated as distinct from 'true' null values + return typing.cast(bool, pd.isna(value)) and not isinstance(value, float) diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 0e94193bd3..59679f1bc4 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -27,7 +27,6 @@ from bigframes.core import field import bigframes.core.identifiers as ids import bigframes.operations -import bigframes.operations.aggregations as agg_ops def const( @@ -44,118 +43,6 @@ def free_var(id: str) -> UnboundVariableExpression: return UnboundVariableExpression(id) -@dataclasses.dataclass(frozen=True) -class Aggregation(abc.ABC): - """Represents windowing or aggregation over a column.""" - - op: agg_ops.WindowOp = dataclasses.field() - - @abc.abstractmethod - def output_type( - self, input_fields: Mapping[ids.ColumnId, field.Field] - ) -> dtypes.ExpressionType: - ... - - @property - def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: - return () - - @abc.abstractmethod - def remap_column_refs( - self, - name_mapping: Mapping[ids.ColumnId, ids.ColumnId], - allow_partial_bindings: bool = False, - ) -> Aggregation: - ... 
- - -@dataclasses.dataclass(frozen=True) -class NullaryAggregation(Aggregation): - op: agg_ops.NullaryWindowOp = dataclasses.field() - - def output_type( - self, input_fields: Mapping[ids.ColumnId, field.Field] - ) -> dtypes.ExpressionType: - return self.op.output_type() - - def remap_column_refs( - self, - name_mapping: Mapping[ids.ColumnId, ids.ColumnId], - allow_partial_bindings: bool = False, - ) -> NullaryAggregation: - return self - - -@dataclasses.dataclass(frozen=True) -class UnaryAggregation(Aggregation): - op: agg_ops.UnaryWindowOp - arg: Union[DerefOp, ScalarConstantExpression] - - def output_type( - self, input_fields: Mapping[ids.ColumnId, field.Field] - ) -> dtypes.ExpressionType: - # TODO(b/419300717) Remove resolutions once defers are cleaned up. - resolved_expr = bind_schema_fields(self.arg, input_fields) - assert resolved_expr.is_resolved - - return self.op.output_type(resolved_expr.output_type) - - @property - def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: - return self.arg.column_references - - def remap_column_refs( - self, - name_mapping: Mapping[ids.ColumnId, ids.ColumnId], - allow_partial_bindings: bool = False, - ) -> UnaryAggregation: - return UnaryAggregation( - self.op, - self.arg.remap_column_refs( - name_mapping, allow_partial_bindings=allow_partial_bindings - ), - ) - - -@dataclasses.dataclass(frozen=True) -class BinaryAggregation(Aggregation): - op: agg_ops.BinaryAggregateOp = dataclasses.field() - left: Union[DerefOp, ScalarConstantExpression] = dataclasses.field() - right: Union[DerefOp, ScalarConstantExpression] = dataclasses.field() - - def output_type( - self, input_fields: Mapping[ids.ColumnId, field.Field] - ) -> dtypes.ExpressionType: - # TODO(b/419300717) Remove resolutions once defers are cleaned up. 
- left_resolved_expr = bind_schema_fields(self.left, input_fields) - assert left_resolved_expr.is_resolved - right_resolved_expr = bind_schema_fields(self.right, input_fields) - assert right_resolved_expr.is_resolved - - return self.op.output_type( - left_resolved_expr.output_type, left_resolved_expr.output_type - ) - - @property - def column_references(self) -> typing.Tuple[ids.ColumnId, ...]: - return (*self.left.column_references, *self.right.column_references) - - def remap_column_refs( - self, - name_mapping: Mapping[ids.ColumnId, ids.ColumnId], - allow_partial_bindings: bool = False, - ) -> BinaryAggregation: - return BinaryAggregation( - self.op, - self.left.remap_column_refs( - name_mapping, allow_partial_bindings=allow_partial_bindings - ), - self.right.remap_column_refs( - name_mapping, allow_partial_bindings=allow_partial_bindings - ), - ) - - TExpression = TypeVar("TExpression", bound="Expression") diff --git a/bigframes/core/groupby/aggs.py b/bigframes/core/groupby/aggs.py index 26257cc9b6..9d8b957d54 100644 --- a/bigframes/core/groupby/aggs.py +++ b/bigframes/core/groupby/aggs.py @@ -14,13 +14,13 @@ from __future__ import annotations -from bigframes.core import expression +from bigframes.core import agg_expressions, expression from bigframes.operations import aggregations as agg_ops -def agg(input: str, op: agg_ops.AggregateOp) -> expression.Aggregation: +def agg(input: str, op: agg_ops.AggregateOp) -> agg_expressions.Aggregation: if isinstance(op, agg_ops.UnaryAggregateOp): - return expression.UnaryAggregation(op, expression.deref(input)) + return agg_expressions.UnaryAggregation(op, expression.deref(input)) else: assert isinstance(op, agg_ops.NullaryAggregateOp) - return expression.NullaryAggregation(op) + return agg_expressions.NullaryAggregation(op) diff --git a/bigframes/core/groupby/dataframe_group_by.py b/bigframes/core/groupby/dataframe_group_by.py index e4e4b313f9..3f5480436a 100644 --- a/bigframes/core/groupby/dataframe_group_by.py +++ b/bigframes/core/groupby/dataframe_group_by.py @@ -24,6 +24,7 @@ import pandas as pd from bigframes import session +from bigframes.core import agg_expressions from bigframes.core import expression as ex from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops @@ -327,7 +328,7 @@ def cumcount(self, ascending: bool = True) -> series.Series: ) ) block, result_id = self._block.apply_analytic( - ex.NullaryAggregation(agg_ops.size_op), + agg_expressions.NullaryAggregation(agg_ops.size_op), window=window_spec, result_label=None, ) @@ -488,7 +489,7 @@ def _agg_string(self, func: str) -> df.DataFrame: return dataframe if self._as_index else self._convert_index(dataframe) def _agg_dict(self, func: typing.Mapping) -> df.DataFrame: - aggregations: typing.List[ex.Aggregation] = [] + aggregations: typing.List[agg_expressions.Aggregation] = [] column_labels = [] want_aggfunc_level = any(utils.is_list_like(aggs) for aggs in func.values()) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index f8ec38621d..2a35ab6546 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -27,6 +27,7 @@ import pandas from bigframes import dtypes +import bigframes.core.agg_expressions as ex_types import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.expression as ex @@ -282,7 +283,7 @@ def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]: filtered_block = 
block_with_offsets.filter_by_id(match_col_id) # Check if key exists at all by counting - count_agg = ex.UnaryAggregation(agg_ops.count_op, ex.deref(offsets_id)) + count_agg = ex_types.UnaryAggregation(agg_ops.count_op, ex.deref(offsets_id)) count_result = filtered_block._expr.aggregate([(count_agg, "count")]) count_scalar = self._block.session._executor.execute( @@ -294,7 +295,7 @@ def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]: # If only one match, return integer position if count_scalar == 1: - min_agg = ex.UnaryAggregation(agg_ops.min_op, ex.deref(offsets_id)) + min_agg = ex_types.UnaryAggregation(agg_ops.min_op, ex.deref(offsets_id)) position_result = filtered_block._expr.aggregate([(min_agg, "position")]) position_scalar = self._block.session._executor.execute( position_result, ex_spec.ExecutionSpec(promise_under_10gb=True) @@ -317,11 +318,11 @@ def _get_monotonic_slice(self, filtered_block, offsets_id: str) -> slice: # Combine min and max aggregations into a single query for efficiency min_max_aggs = [ ( - ex.UnaryAggregation(agg_ops.min_op, ex.deref(offsets_id)), + ex_types.UnaryAggregation(agg_ops.min_op, ex.deref(offsets_id)), "min_pos", ), ( - ex.UnaryAggregation(agg_ops.max_op, ex.deref(offsets_id)), + ex_types.UnaryAggregation(agg_ops.max_op, ex.deref(offsets_id)), "max_pos", ), ] diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index 958113dda3..c214d0bb7e 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -277,7 +277,10 @@ def _adapt_pandas_series( ) return pa.array(series, type=pa.string()), bigframes.dtypes.GEO_DTYPE try: - return _adapt_arrow_array(pa.array(series)) + pa_arr = pa.array(series) + if isinstance(pa_arr, pa.ChunkedArray): + return _adapt_chunked_array(pa_arr) + return _adapt_arrow_array(pa_arr) except pa.ArrowInvalid as e: if series.dtype == np.dtype("O"): try: diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index cf6e8a7e5c..b6483689dc 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -33,7 +33,7 @@ import google.cloud.bigquery as bq -from bigframes.core import identifiers, local_data, sequences +from bigframes.core import agg_expressions, identifiers, local_data, sequences from bigframes.core.bigframe_node import BigFrameNode, COLUMN_SET import bigframes.core.expression as ex from bigframes.core.field import Field @@ -1337,7 +1337,9 @@ def remap_refs( @dataclasses.dataclass(frozen=True, eq=False) class AggregateNode(UnaryNode): - aggregations: typing.Tuple[typing.Tuple[ex.Aggregation, identifiers.ColumnId], ...] + aggregations: typing.Tuple[ + typing.Tuple[agg_expressions.Aggregation, identifiers.ColumnId], ... + ] by_column_ids: typing.Tuple[ex.DerefOp, ...] = tuple([]) order_by: Tuple[OrderingExpression, ...] 
= () dropna: bool = True @@ -1360,9 +1362,7 @@ def fields(self) -> Sequence[Field]: agg_items = ( Field( id, - bigframes.dtypes.dtype_for_etype( - agg.output_type(self.child.field_by_id) - ), + ex.bind_schema_fields(agg, self.child.field_by_id).output_type, nullable=True, ) for agg, id in self.aggregations @@ -1437,7 +1437,7 @@ def remap_refs( @dataclasses.dataclass(frozen=True, eq=False) class WindowOpNode(UnaryNode, AdditiveNode): - expression: ex.Aggregation + expression: agg_expressions.Aggregation window_spec: window.WindowSpec output_name: identifiers.ColumnId never_skip_nulls: bool = False @@ -1478,11 +1478,10 @@ def row_count(self) -> Optional[int]: @functools.cached_property def added_field(self) -> Field: - input_fields = self.child.field_by_id # TODO: Determine if output could be non-null return Field( self.output_name, - bigframes.dtypes.dtype_for_etype(self.expression.output_type(input_fields)), + ex.bind_schema_fields(self.expression, self.child.field_by_id).output_type, ) @property diff --git a/bigframes/core/rewrite/order.py b/bigframes/core/rewrite/order.py index 5b5fb10753..881badd603 100644 --- a/bigframes/core/rewrite/order.py +++ b/bigframes/core/rewrite/order.py @@ -15,7 +15,7 @@ import functools from typing import Mapping, Tuple -from bigframes.core import expression, identifiers +from bigframes.core import agg_expressions, expression, identifiers import bigframes.core.nodes import bigframes.core.ordering import bigframes.core.window_spec @@ -167,9 +167,7 @@ def pull_up_order_inner( ) else: # Otherwise we need to generate offsets - agg = bigframes.core.expression.NullaryAggregation( - agg_ops.RowNumberOp() - ) + agg = agg_expressions.NullaryAggregation(agg_ops.RowNumberOp()) window_spec = bigframes.core.window_spec.unbound( ordering=tuple(child_order.all_ordering_columns) ) @@ -287,9 +285,7 @@ def pull_order_concat( new_source, ((order_expression.scalar_expression, offsets_id),) ) else: - agg = bigframes.core.expression.NullaryAggregation( - agg_ops.RowNumberOp() - ) + agg = agg_expressions.NullaryAggregation(agg_ops.RowNumberOp()) window_spec = bigframes.core.window_spec.unbound( ordering=tuple(order.all_ordering_columns) ) @@ -423,7 +419,7 @@ def remove_order_strict( def rewrite_promote_offsets( node: bigframes.core.nodes.PromoteOffsetsNode, ) -> bigframes.core.nodes.WindowOpNode: - agg = bigframes.core.expression.NullaryAggregation(agg_ops.RowNumberOp()) + agg = agg_expressions.NullaryAggregation(agg_ops.RowNumberOp()) window_spec = bigframes.core.window_spec.unbound() return bigframes.core.nodes.WindowOpNode(node.child, agg, window_spec, node.col_id) diff --git a/bigframes/core/rewrite/schema_binding.py b/bigframes/core/rewrite/schema_binding.py index cbecf83035..8a0bcc4921 100644 --- a/bigframes/core/rewrite/schema_binding.py +++ b/bigframes/core/rewrite/schema_binding.py @@ -15,7 +15,7 @@ import dataclasses import typing -from bigframes.core import bigframe_node +from bigframes.core import agg_expressions, bigframe_node from bigframes.core import expression as ex from bigframes.core import nodes, ordering @@ -118,16 +118,16 @@ def bind_schema_to_node( def _bind_schema_to_aggregation_expr( - aggregation: ex.Aggregation, + aggregation: agg_expressions.Aggregation, child: bigframe_node.BigFrameNode, -) -> ex.Aggregation: +) -> agg_expressions.Aggregation: assert isinstance( - aggregation, ex.Aggregation + aggregation, agg_expressions.Aggregation ), f"Expected Aggregation, got {type(aggregation)}" - if isinstance(aggregation, ex.UnaryAggregation): + if 
isinstance(aggregation, agg_expressions.UnaryAggregation): return typing.cast( - ex.Aggregation, + agg_expressions.Aggregation, dataclasses.replace( aggregation, arg=typing.cast( @@ -136,9 +136,9 @@ def _bind_schema_to_aggregation_expr( ), ), ) - elif isinstance(aggregation, ex.BinaryAggregation): + elif isinstance(aggregation, agg_expressions.BinaryAggregation): return typing.cast( - ex.Aggregation, + agg_expressions.Aggregation, dataclasses.replace( aggregation, left=typing.cast( diff --git a/bigframes/core/rewrite/timedeltas.py b/bigframes/core/rewrite/timedeltas.py index ea8e608a84..91c6ab83c6 100644 --- a/bigframes/core/rewrite/timedeltas.py +++ b/bigframes/core/rewrite/timedeltas.py @@ -20,6 +20,7 @@ from bigframes import dtypes from bigframes import operations as ops +from bigframes.core import agg_expressions as ex_types from bigframes.core import expression as ex from bigframes.core import nodes, schema, utils from bigframes.operations import aggregations as aggs @@ -219,33 +220,33 @@ def _rewrite_to_timedelta_op(op: ops.ToTimedeltaOp, arg: _TypedExpr): @functools.cache def _rewrite_aggregation( - aggregation: ex.Aggregation, schema: schema.ArraySchema -) -> ex.Aggregation: - if not isinstance(aggregation, ex.UnaryAggregation): + aggregation: ex_types.Aggregation, schema: schema.ArraySchema +) -> ex_types.Aggregation: + if not isinstance(aggregation, ex_types.UnaryAggregation): return aggregation if isinstance(aggregation.arg, ex.DerefOp): input_type = schema.get_type(aggregation.arg.id.sql) else: - input_type = aggregation.arg.dtype + input_type = aggregation.arg.output_type if isinstance(aggregation.op, aggs.DiffOp): if dtypes.is_datetime_like(input_type): - return ex.UnaryAggregation( + return ex_types.UnaryAggregation( aggs.TimeSeriesDiffOp(aggregation.op.periods), aggregation.arg ) elif input_type == dtypes.DATE_DTYPE: - return ex.UnaryAggregation( + return ex_types.UnaryAggregation( aggs.DateSeriesDiffOp(aggregation.op.periods), aggregation.arg ) if isinstance(aggregation.op, aggs.StdOp) and input_type == dtypes.TIMEDELTA_DTYPE: - return ex.UnaryAggregation( + return ex_types.UnaryAggregation( aggs.StdOp(should_floor_result=True), aggregation.arg ) if isinstance(aggregation.op, aggs.MeanOp) and input_type == dtypes.TIMEDELTA_DTYPE: - return ex.UnaryAggregation( + return ex_types.UnaryAggregation( aggs.MeanOp(should_floor_result=True), aggregation.arg ) @@ -253,7 +254,7 @@ def _rewrite_aggregation( isinstance(aggregation.op, aggs.QuantileOp) and input_type == dtypes.TIMEDELTA_DTYPE ): - return ex.UnaryAggregation( + return ex_types.UnaryAggregation( aggs.QuantileOp(q=aggregation.op.q, should_floor_result=True), aggregation.arg, ) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f9de117b29..c65bbdd2c8 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -57,7 +57,7 @@ import bigframes._config.display_options as display_options import bigframes.constants import bigframes.core -from bigframes.core import log_adapter +from bigframes.core import agg_expressions, log_adapter import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.convert @@ -1363,7 +1363,9 @@ def _fast_stat_matrix(self, op: agg_ops.BinaryAggregateOp) -> DataFrame: block = frame._block aggregations = [ - ex.BinaryAggregation(op, ex.deref(left_col), ex.deref(right_col)) + agg_expressions.BinaryAggregation( + op, ex.deref(left_col), ex.deref(right_col) + ) for left_col in block.value_columns for right_col in block.value_columns ] 
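
The `Aggregation` classes referenced throughout these hunks now live in the new `bigframes.core.agg_expressions` module (added earlier in this diff), where they subclass `expression.Expression` and so inherit the generic `is_resolved` / `output_type` / `bind_refs` machinery instead of carrying their own `output_type(input_fields)` signature. A minimal sketch of how the relocated classes compose — not code from this PR, and the column name is hypothetical:

```python
from bigframes.core import agg_expressions
import bigframes.core.expression as ex
import bigframes.operations.aggregations as agg_ops

# An aggregation over a column reference stays unresolved until a schema is bound.
agg = agg_expressions.UnaryAggregation(agg_ops.min_op, ex.deref("my_col"))
assert not agg.is_resolved
assert agg.column_references  # (ColumnId("my_col"),)

# A constant input is already resolved, so output_type is immediately available.
const_agg = agg_expressions.UnaryAggregation(agg_ops.count_op, ex.const(1))
assert const_agg.is_resolved
const_agg.output_type  # Int64, per count_op's output signature
```
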
@@ -1630,7 +1632,7 @@ def corrwith( block, _ = block.aggregate( aggregations=tuple( - ex.BinaryAggregation(agg_ops.CorrOp(), left_ex, right_ex) + agg_expressions.BinaryAggregation(agg_ops.CorrOp(), left_ex, right_ex) for left_ex, right_ex in expr_pairs ), column_labels=labels, @@ -3189,9 +3191,9 @@ def agg( for agg_func in agg_func_list: agg_op = agg_ops.lookup_agg_func(typing.cast(str, agg_func)) agg_expr = ( - ex.UnaryAggregation(agg_op, ex.deref(col_id)) + agg_expressions.UnaryAggregation(agg_op, ex.deref(col_id)) if isinstance(agg_op, agg_ops.UnaryAggregateOp) - else ex.NullaryAggregation(agg_op) + else agg_expressions.NullaryAggregation(agg_op) ) aggs.append(agg_expr) labels.append(col_label) diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index a8c9f9c301..d994d6353a 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -51,6 +51,15 @@ } ) +# https://cloud.google.com/functions/docs/reference/rest/v2/projects.locations.functions#vpconnectoregresssettings +_VPC_EGRESS_SETTINGS_MAP = types.MappingProxyType( + { + "all": functions_v2.ServiceConfig.VpcConnectorEgressSettings.ALL_TRAFFIC, + "private-ranges-only": functions_v2.ServiceConfig.VpcConnectorEgressSettings.PRIVATE_RANGES_ONLY, + "unspecified": functions_v2.ServiceConfig.VpcConnectorEgressSettings.VPC_CONNECTOR_EGRESS_SETTINGS_UNSPECIFIED, + } +) + # BQ managed functions (@udf) currently only support Python 3.11. _MANAGED_FUNC_PYTHON_VERSION = "python-3.11" @@ -375,6 +384,7 @@ def create_cloud_function( max_instance_count=None, is_row_processor=False, vpc_connector=None, + vpc_connector_egress_settings="private-ranges-only", memory_mib=1024, ingress_settings="internal-only", ): @@ -472,6 +482,15 @@ def create_cloud_function( function.service_config.max_instance_count = max_instance_count if vpc_connector is not None: function.service_config.vpc_connector = vpc_connector + if vpc_connector_egress_settings not in _VPC_EGRESS_SETTINGS_MAP: + raise bf_formatting.create_exception_with_feedback_link( + ValueError, + f"'{vpc_connector_egress_settings}' not one of the supported vpc egress settings values: {list(_VPC_EGRESS_SETTINGS_MAP)}", + ) + function.service_config.vpc_connector_egress_settings = cast( + functions_v2.ServiceConfig.VpcConnectorEgressSettings, + _VPC_EGRESS_SETTINGS_MAP[vpc_connector_egress_settings], + ) function.service_config.service_account_email = ( self._cloud_function_service_account ) @@ -532,6 +551,7 @@ def provision_bq_remote_function( cloud_function_max_instance_count, is_row_processor, cloud_function_vpc_connector, + cloud_function_vpc_connector_egress_settings, cloud_function_memory_mib, cloud_function_ingress_settings, bq_metadata, @@ -580,6 +600,7 @@ def provision_bq_remote_function( max_instance_count=cloud_function_max_instance_count, is_row_processor=is_row_processor, vpc_connector=cloud_function_vpc_connector, + vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings, memory_mib=cloud_function_memory_mib, ingress_settings=cloud_function_ingress_settings, ) diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index a2fb66539b..6b5c9bf071 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -245,6 +245,9 @@ def remote_function( cloud_function_timeout: Optional[int] = 600, cloud_function_max_instances: Optional[int] = None, cloud_function_vpc_connector: Optional[str] = None, + 
cloud_function_vpc_connector_egress_settings: Literal[ + "all", "private-ranges-only", "unspecified" + ] = "private-ranges-only", cloud_function_memory_mib: Optional[int] = 1024, cloud_function_ingress_settings: Literal[ "all", "internal-only", "internal-and-gclb" @@ -425,6 +428,13 @@ def remote_function( function. This is useful if your code needs access to data or service(s) that are on a VPC network. See for more details https://cloud.google.com/functions/docs/networking/connecting-vpc. + cloud_function_vpc_connector_egress_settings (str, Optional): + Egress settings for the VPC connector, controlling what outbound + traffic is routed through the VPC connector. + Options are: `all`, `private-ranges-only`, or `unspecified`. + If not specified, `private-ranges-only` is used by default. + See for more details + https://cloud.google.com/run/docs/configuring/vpc-connectors#egress-job. cloud_function_memory_mib (int, Optional): The amounts of memory (in mebibytes) to allocate for the cloud function (2nd gen) created. This also dictates a corresponding @@ -616,6 +626,7 @@ def wrapper(func): cloud_function_max_instance_count=cloud_function_max_instances, is_row_processor=is_row_processor, cloud_function_vpc_connector=cloud_function_vpc_connector, + cloud_function_vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings, cloud_function_memory_mib=cloud_function_memory_mib, cloud_function_ingress_settings=cloud_function_ingress_settings, bq_metadata=bqrf_metadata, diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 6889997a10..0ee80fd74b 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -17,14 +17,18 @@ import abc import dataclasses import typing -from typing import ClassVar, Iterable, Optional +from typing import ClassVar, Iterable, Optional, TYPE_CHECKING import pandas as pd import pyarrow as pa +from bigframes.core import agg_expressions import bigframes.dtypes as dtypes import bigframes.operations.type as signatures +if TYPE_CHECKING: + from bigframes.core import expression + @dataclasses.dataclass(frozen=True) class WindowOp: @@ -110,6 +114,14 @@ class NullaryAggregateOp(AggregateOp, NullaryWindowOp): def arguments(self) -> int: return 0 + def as_expr( + self, + *exprs: typing.Union[str, expression.Expression], + ) -> agg_expressions.NullaryAggregation: + from bigframes.core import agg_expressions + + return agg_expressions.NullaryAggregation(self) + @dataclasses.dataclass(frozen=True) class UnaryAggregateOp(AggregateOp, UnaryWindowOp): @@ -117,6 +129,23 @@ class UnaryAggregateOp(AggregateOp, UnaryWindowOp): def arguments(self) -> int: return 1 + def as_expr( + self, + *exprs: typing.Union[str, expression.Expression], + ) -> agg_expressions.UnaryAggregation: + from bigframes.core import agg_expressions + from bigframes.operations.base_ops import _convert_expr_input + + # Keep this in sync with output_type and compilers + inputs: list[expression.Expression] = [] + + for expr in exprs: + inputs.append(_convert_expr_input(expr)) + return agg_expressions.UnaryAggregation( + self, + inputs[0], + ) + @dataclasses.dataclass(frozen=True) class BinaryAggregateOp(AggregateOp): @@ -124,6 +153,21 @@ class BinaryAggregateOp(AggregateOp): def arguments(self) -> int: return 2 + def as_expr( + self, + *exprs: typing.Union[str, expression.Expression], + ) -> agg_expressions.BinaryAggregation: + from bigframes.core import agg_expressions + from 
bigframes.operations.base_ops import _convert_expr_input + + # Keep this in sync with output_type and compilers + inputs: list[expression.Expression] = [] + + for expr in exprs: + inputs.append(_convert_expr_input(expr)) + + return agg_expressions.BinaryAggregation(self, inputs[0], inputs[1]) + @dataclasses.dataclass(frozen=True) class SizeOp(NullaryAggregateOp): @@ -335,9 +379,26 @@ def skips_nulls(self): return True def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - return pd.ArrowDtype( - pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(input_types[0])) - ) + return dtypes.list_type(input_types[0]) + + +@dataclasses.dataclass(frozen=True) +class StringAggOp(UnaryAggregateOp): + name: ClassVar[str] = "string_agg" + sep: str = "," + + @property + def order_independent(self): + return False + + @property + def skips_nulls(self): + return True + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] != dtypes.STRING_DTYPE: + raise TypeError(f"Type {input_types[0]} is not string-like") + return dtypes.STRING_DTYPE @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 9022a1665e..4743483954 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -24,6 +24,7 @@ import bigframes.dataframe as df import bigframes.operations as ops from bigframes.operations._op_converters import convert_index, convert_slice +import bigframes.operations.aggregations as agg_ops import bigframes.operations.base import bigframes.series as series @@ -295,6 +296,11 @@ def cat( ) -> series.Series: return self._apply_binary_op(others, ops.strconcat_op, alignment=join) + def join(self, sep: str) -> series.Series: + return self._apply_unary_op( + ops.ArrayReduceOp(aggregation=agg_ops.StringAggOp(sep=sep)) + ) + def to_blob(self, connection: Optional[str] = None) -> series.Series: """Create a BigFrames Blob series from a series of URIs. 
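Note: the new `Series.str.join` accessor above is implemented by wrapping the new `StringAggOp` in an `ArrayReduceOp`. A minimal usage sketch (illustrative data; assumes a configured BigQuery DataFrames session):

```python
import bigframes.pandas as bpd

# Each element is a list of strings; str.join reduces each list to a single
# string via ArrayReduceOp(aggregation=StringAggOp(sep="-")).
s = bpd.Series([["a", "b", "c"], ["hello", "world"]])
result = s.str.join("-")
print(result.to_pandas())
# Expected values: "a-b-c" and "hello-world"
```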
diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 6ffed5b53f..9d4fc101f6 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -87,6 +87,9 @@ def remote_function( cloud_function_timeout: Optional[int] = 600, cloud_function_max_instances: Optional[int] = None, cloud_function_vpc_connector: Optional[str] = None, + cloud_function_vpc_connector_egress_settings: Literal[ + "all", "private-ranges-only", "unspecified" + ] = "private-ranges-only", cloud_function_memory_mib: Optional[int] = 1024, cloud_function_ingress_settings: Literal[ "all", "internal-only", "internal-and-gclb" @@ -109,6 +112,7 @@ def remote_function( cloud_function_timeout=cloud_function_timeout, cloud_function_max_instances=cloud_function_max_instances, cloud_function_vpc_connector=cloud_function_vpc_connector, + cloud_function_vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings, cloud_function_memory_mib=cloud_function_memory_mib, cloud_function_ingress_settings=cloud_function_ingress_settings, cloud_build_service_account=cloud_build_service_account, diff --git a/bigframes/series.py b/bigframes/series.py index c95b2ca37f..3e24a75d9b 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -49,7 +49,7 @@ import typing_extensions import bigframes.core -from bigframes.core import groupby, log_adapter +from bigframes.core import agg_expressions, groupby, log_adapter import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.expression as ex @@ -1391,7 +1391,9 @@ def mode(self) -> Series: block, agg_ids = block.aggregate( by_column_ids=[self._value_column], aggregations=( - ex.UnaryAggregation(agg_ops.count_op, ex.deref(self._value_column)), + agg_expressions.UnaryAggregation( + agg_ops.count_op, ex.deref(self._value_column) + ), ), ) value_count_col_id = agg_ids[0] @@ -2116,7 +2118,11 @@ def unique(self, keep_order=True) -> Series: return self.drop_duplicates() block, result = self._block.aggregate( [self._value_column], - [ex.UnaryAggregation(agg_ops.AnyValueOp(), ex.deref(self._value_column))], + [ + agg_expressions.UnaryAggregation( + agg_ops.AnyValueOp(), ex.deref(self._value_column) + ) + ], column_labels=self._block.column_labels, dropna=False, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 432e73159a..6252a59e31 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -345,15 +345,6 @@ def bytes_processed_sum(self): @property def slot_millis_sum(self): """The sum of all slot time used by bigquery jobs in this session.""" - if not bigframes.options._allow_large_results: - msg = bfe.format_message( - "Queries executed with `allow_large_results=False` within the session will not " - "have their slot milliseconds counted in this sum. 
If you need precise slot " - "milliseconds information, query the `INFORMATION_SCHEMA` tables " - "to get relevant metrics.", - ) - warnings.warn(msg, UserWarning) - return self._metrics.slot_millis @property @@ -1519,6 +1510,9 @@ def remote_function( cloud_function_timeout: Optional[int] = 600, cloud_function_max_instances: Optional[int] = None, cloud_function_vpc_connector: Optional[str] = None, + cloud_function_vpc_connector_egress_settings: Literal[ + "all", "private-ranges-only", "unspecified" + ] = "private-ranges-only", cloud_function_memory_mib: Optional[int] = 1024, cloud_function_ingress_settings: Literal[ "all", "internal-only", "internal-and-gclb" @@ -1684,6 +1678,13 @@ def remote_function( function. This is useful if your code needs access to data or service(s) that are on a VPC network. See for more details https://cloud.google.com/functions/docs/networking/connecting-vpc. + cloud_function_vpc_connector_egress_settings (str, Optional): + Egress settings for the VPC connector, controlling what outbound + traffic is routed through the VPC connector. + Options are: `all`, `private-ranges-only`, or `unspecified`. + If not specified, `private-ranges-only` is used by default. + See for more details + https://cloud.google.com/run/docs/configuring/vpc-connectors#egress-job. cloud_function_memory_mib (int, Optional): The amounts of memory (in mebibytes) to allocate for the cloud function (2nd gen) created. This also dictates a corresponding @@ -1741,6 +1742,7 @@ def remote_function( cloud_function_timeout=cloud_function_timeout, cloud_function_max_instances=cloud_function_max_instances, cloud_function_vpc_connector=cloud_function_vpc_connector, + cloud_function_vpc_connector_egress_settings=cloud_function_vpc_connector_egress_settings, cloud_function_memory_mib=cloud_function_memory_mib, cloud_function_ingress_settings=cloud_function_ingress_settings, cloud_build_service_account=cloud_build_service_account, diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index b428cd646c..b7412346bd 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -323,7 +323,10 @@ def _export_gbq( self.bqclient.update_table(table, ["schema"]) return executor.ExecuteResult( - row_iter.to_arrow_iterable(), array_value.schema, query_job + row_iter.to_arrow_iterable(), + array_value.schema, + query_job, + total_bytes_processed=row_iter.total_bytes_processed, ) def dry_run( @@ -671,6 +674,7 @@ def _execute_plan_gbq( query_job=query_job, total_bytes=size_bytes, total_rows=iterator.total_rows, + total_bytes_processed=iterator.total_bytes_processed, ) diff --git a/bigframes/session/direct_gbq_execution.py b/bigframes/session/direct_gbq_execution.py index ff91747a62..7538c9300f 100644 --- a/bigframes/session/direct_gbq_execution.py +++ b/bigframes/session/direct_gbq_execution.py @@ -63,6 +63,7 @@ def execute( schema=plan.schema, query_job=query_job, total_rows=iterator.total_rows, + total_bytes_processed=iterator.total_bytes_processed, ) def _run_execute_query( diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index 748b10647a..d0cfe5f4f7 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -45,6 +45,7 @@ class ExecuteResult: query_job: Optional[bigquery.QueryJob] = None total_bytes: Optional[int] = None total_rows: Optional[int] = None + total_bytes_processed: Optional[int] = None @property def arrow_batches(self) -> 
Iterator[pyarrow.RecordBatch]: diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py index d8df558fe4..a1e1d436e1 100644 --- a/bigframes/session/polars_executor.py +++ b/bigframes/session/polars_executor.py @@ -18,7 +18,14 @@ import pyarrow as pa -from bigframes.core import array_value, bigframe_node, expression, local_data, nodes +from bigframes.core import ( + agg_expressions, + array_value, + bigframe_node, + expression, + local_data, + nodes, +) import bigframes.operations from bigframes.operations import aggregations as agg_ops from bigframes.operations import ( @@ -112,7 +119,7 @@ def _is_node_polars_executable(node: nodes.BigFrameNode): if not isinstance(node, _COMPATIBLE_NODES): return False for expr in node._node_expressions: - if isinstance(expr, expression.Aggregation): + if isinstance(expr, agg_expressions.Aggregation): if not type(expr.op) in _COMPATIBLE_AGG_OPS: return False if isinstance(expr, expression.Expression): diff --git a/bigframes/version.py b/bigframes/version.py index 78b6498d2d..558f26d68e 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.18.0" +__version__ = "2.19.0" # {x-release-please-start-date} -__release_date__ = "2025-09-03" +__release_date__ = "2025-09-09" # {x-release-please-end} diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index f60786437f..22b623193d 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -1478,14 +1478,20 @@ def square_num(x): reuse=False, cloud_function_service_account="default", cloud_function_vpc_connector=gcf_vpc_connector, + cloud_function_vpc_connector_egress_settings="all", cloud_function_ingress_settings="all", )(square_num) - # assert that the GCF is created with the intended vpc connector gcf = rf_session.cloudfunctionsclient.get_function( name=square_num_remote.bigframes_cloud_function ) + + # assert that the GCF is created with the intended vpc connector and + # egress settings. assert gcf.service_config.vpc_connector == gcf_vpc_connector + # The value is 2 (ALL_TRAFFIC) since we set + # cloud_function_vpc_connector_egress_settings="all" earlier. 
+ assert gcf.service_config.vpc_connector_egress_settings == 2 # assert that the function works as expected on data scalars_df, scalars_pandas_df = scalars_dfs diff --git a/tests/system/large/test_dataframe_io.py b/tests/system/large/test_dataframe_io.py index 87d2acd34b..c60940109d 100644 --- a/tests/system/large/test_dataframe_io.py +++ b/tests/system/large/test_dataframe_io.py @@ -48,11 +48,12 @@ def test_to_pandas_batches_override_global_option( ): with bigframes.option_context(LARGE_TABLE_OPTION, False): df = session.read_gbq(WIKIPEDIA_TABLE) - pages = list( - df.to_pandas_batches( - page_size=500, max_results=1500, allow_large_results=True - ) + batches = df.sort_values("id").to_pandas_batches( + page_size=500, max_results=1500, allow_large_results=True ) + assert batches.total_rows > 0 + assert batches.total_bytes_processed > 0 + pages = list(batches) assert all((len(page) <= 500) for page in pages) assert sum(len(page) for page in pages) == 1500 diff --git a/tests/system/small/engines/test_aggregation.py b/tests/system/small/engines/test_aggregation.py index c2fc9ad706..a4a49c622a 100644 --- a/tests/system/small/engines/test_aggregation.py +++ b/tests/system/small/engines/test_aggregation.py @@ -14,7 +14,7 @@ import pytest -from bigframes.core import array_value, expression, identifiers, nodes +from bigframes.core import agg_expressions, array_value, expression, identifiers, nodes import bigframes.operations.aggregations as agg_ops from bigframes.session import polars_executor from bigframes.testing.engine_utils import assert_equivalence_execution @@ -37,7 +37,7 @@ def apply_agg_to_all_valid( continue try: _ = op.output_type(array.get_column_type(arg)) - expr = expression.UnaryAggregation(op, expression.deref(arg)) + expr = agg_expressions.UnaryAggregation(op, expression.deref(arg)) name = f"{arg}-{op.name}" exprs_by_name.append((expr, name)) except TypeError: @@ -56,11 +56,11 @@ def test_engines_aggregate_size( scalars_array_value.node, aggregations=( ( - expression.NullaryAggregation(agg_ops.SizeOp()), + agg_expressions.NullaryAggregation(agg_ops.SizeOp()), identifiers.ColumnId("size_op"), ), ( - expression.UnaryAggregation( + agg_expressions.UnaryAggregation( agg_ops.SizeUnaryOp(), expression.deref("string_col") ), identifiers.ColumnId("unary_size_op"), @@ -103,11 +103,11 @@ def test_engines_grouped_aggregate( scalars_array_value.node, aggregations=( ( - expression.NullaryAggregation(agg_ops.SizeOp()), + agg_expressions.NullaryAggregation(agg_ops.SizeOp()), identifiers.ColumnId("size_op"), ), ( - expression.UnaryAggregation( + agg_expressions.UnaryAggregation( agg_ops.SizeUnaryOp(), expression.deref("string_col") ), identifiers.ColumnId("unary_size_op"), diff --git a/tests/system/small/engines/test_generic_ops.py b/tests/system/small/engines/test_generic_ops.py index 8deef3638e..14c6e9a454 100644 --- a/tests/system/small/engines/test_generic_ops.py +++ b/tests/system/small/engines/test_generic_ops.py @@ -392,7 +392,7 @@ def test_engines_invert_op(scalars_array_value: array_value.ArrayValue, engine): assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_isin_op(scalars_array_value: array_value.ArrayValue, engine): arr, col_ids = scalars_array_value.compute_values( [ diff --git a/tests/system/small/engines/test_windowing.py b/tests/system/small/engines/test_windowing.py index a5f20a47cd..f344a3b60a 
100644 --- a/tests/system/small/engines/test_windowing.py +++ b/tests/system/small/engines/test_windowing.py @@ -15,7 +15,14 @@ from google.cloud import bigquery import pytest -from bigframes.core import array_value, expression, identifiers, nodes, window_spec +from bigframes.core import ( + agg_expressions, + array_value, + expression, + identifiers, + nodes, + window_spec, +) import bigframes.operations.aggregations as agg_ops from bigframes.session import direct_gbq_execution, polars_executor from bigframes.testing.engine_utils import assert_equivalence_execution @@ -48,7 +55,9 @@ def test_engines_with_rows_window( ) window_node = nodes.WindowOpNode( child=scalars_array_value.node, - expression=expression.UnaryAggregation(agg_op, expression.deref("int64_too")), + expression=agg_expressions.UnaryAggregation( + agg_op, expression.deref("int64_too") + ), window_spec=window, output_name=identifiers.ColumnId("agg_int64"), never_skip_nulls=never_skip_nulls, diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index a720614892..afd1a74dff 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -736,3 +736,14 @@ def test_getitem_w_struct_array(): expected = bpd.Series(expected_data, dtype=bpd.ArrowDtype((pa_struct))) assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_string_join(session): + pd_series = pd.Series([["a", "b", "c"], ["100"], ["hello", "world"], []]) + bf_series = session.read_pandas(pd_series) + + pd_result = pd_series.str.join("--") + bf_result = bf_series.str.join("--").to_pandas() + + pd_result = pd_result.astype("string[pyarrow]") + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index 9ace2dbed7..6d3cf6fe88 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -48,6 +48,9 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_session): batches = df.to_pandas_batches( page_size=100, ) + assert batches.total_rows > 0 + assert batches.total_bytes_processed is None # No additional query. 
+ executions_after = maybe_ordered_session._metrics.execution_count num_batches = 0 diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index dce0a649f6..323956b038 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -138,6 +138,16 @@ def test_df_construct_structs(session): ) +def test_df_construct_local_concat_pd(scalars_pandas_df_index, session): + pd_df = pd.concat([scalars_pandas_df_index, scalars_pandas_df_index]) + + bf_df = session.read_pandas(pd_df) + + pd.testing.assert_frame_equal( + bf_df.to_pandas(), pd_df, check_index_type=False, check_dtype=False + ) + + def test_df_construct_pandas_set_dtype(scalars_dfs): columns = [ "int64_too", @@ -878,6 +888,30 @@ def test_join_repr(scalars_dfs_maybe_ordered): assert actual == expected +def test_repr_w_display_options(scalars_dfs, session): + metrics = session._metrics + scalars_df, _ = scalars_dfs + # get a pandas df of the expected format + df, _ = scalars_df._block.to_pandas() + pandas_df = df.set_axis(scalars_df._block.column_labels, axis=1) + pandas_df.index.name = scalars_df.index.name + + executions_pre = metrics.execution_count + with bigframes.option_context( + "display.max_rows", 10, "display.max_columns", 5, "display.max_colwidth", 10 + ): + + # When there are 10 or fewer rows, the outputs should be identical except for the extra note. + actual = scalars_df.head(10).__repr__() + executions_post = metrics.execution_count + + with display_options.pandas_repr(bigframes.options.display): + pandas_repr = pandas_df.head(10).__repr__() + + assert actual == pandas_repr + assert (executions_post - executions_pre) <= 3 + + def test_repr_html_w_all_rows(scalars_dfs, session): metrics = session._metrics scalars_df, _ = scalars_dfs diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 1d6ae370c5..96d7881d67 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -339,6 +339,13 @@ def test_to_arrow_override_global_option(scalars_df_index): assert scalars_df_index._query_job.destination.table_id == table_id +def test_to_pandas_batches_populates_total_bytes_processed(scalars_df_default_index): + batches = scalars_df_default_index.sort_values( + "int64_col" + ).to_pandas_batches() # Do a sort to force query execution. 
+ assert batches.total_bytes_processed > 0 + + def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): """Verify to_pandas_batches() APIs returns the expected dtypes.""" expected = scalars_df_default_index.dtypes diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py index 96cdceb3c6..d12b4dda17 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py +++ b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py @@ -14,7 +14,7 @@ import pytest -from bigframes.core import array_value, expression, identifiers, nodes +from bigframes.core import agg_expressions, array_value, expression, identifiers, nodes from bigframes.operations import aggregations as agg_ops import bigframes.pandas as bpd @@ -26,7 +26,7 @@ def _apply_unary_op(obj: bpd.DataFrame, op: agg_ops.UnaryWindowOp, arg: str) -> obj._block.expr.node, aggregations=( ( - expression.UnaryAggregation(op, expression.deref(arg)), + agg_expressions.UnaryAggregation(op, expression.deref(arg)), identifiers.ColumnId(arg + "_agg"), ), ), diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_mul_timedelta/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_mul_timedelta/out.sql index c8a8cf6cbf..f8752d0a60 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_mul_timedelta/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_mul_timedelta/out.sql @@ -11,7 +11,7 @@ WITH `bfcte_0` AS ( `bfcol_1` AS `bfcol_8`, `bfcol_2` AS `bfcol_9`, `bfcol_0` AS `bfcol_10`, - INTERVAL `bfcol_3` MICROSECOND AS `bfcol_11` + `bfcol_3` AS `bfcol_11` FROM `bfcte_0` ), `bfcte_2` AS ( SELECT diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_sub_timedelta/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_sub_timedelta/out.sql index 460f941d1b..2d615fcca6 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_sub_timedelta/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_sub_timedelta/out.sql @@ -11,7 +11,7 @@ WITH `bfcte_0` AS ( `bfcol_1` AS `bfcol_8`, `bfcol_2` AS `bfcol_9`, `bfcol_0` AS `bfcol_10`, - INTERVAL `bfcol_3` MICROSECOND AS `bfcol_11` + `bfcol_3` AS `bfcol_11` FROM `bfcte_0` ), `bfcte_2` AS ( SELECT diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/out.sql index 36941df71b..7a1a2a743d 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_is_in/out.sql @@ -1,13 +1,32 @@ WITH `bfcte_0` AS ( SELECT - `int64_col` AS `bfcol_0` + `int64_col` AS `bfcol_0`, + `float64_col` AS `bfcol_1` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT *, - `bfcol_0` IN (1, 2, 3) AS `bfcol_1` + COALESCE(`bfcol_0` IN (1, 2, 3), FALSE) AS `bfcol_2`, + ( + `bfcol_0` IS NULL + ) OR `bfcol_0` IN (123456) AS `bfcol_3`, + COALESCE(`bfcol_0` IN (1.0, 2.0, 3.0), FALSE) AS `bfcol_4`, + FALSE AS `bfcol_5`, + COALESCE(`bfcol_0` IN (2.5, 3), FALSE) AS `bfcol_6`, + FALSE AS `bfcol_7`, + COALESCE(`bfcol_0` IN (123456), FALSE) AS `bfcol_8`, 
+ ( + `bfcol_1` IS NULL + ) OR `bfcol_1` IN (1, 2, 3) AS `bfcol_9` FROM `bfcte_0` ) SELECT - `bfcol_1` AS `int64_col` + `bfcol_2` AS `ints`, + `bfcol_3` AS `ints_w_null`, + `bfcol_4` AS `floats`, + `bfcol_5` AS `strings`, + `bfcol_6` AS `mixed`, + `bfcol_7` AS `empty`, + `bfcol_8` AS `ints_wo_match_nulls`, + `bfcol_9` AS `float_in_ints` FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_timedelta/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_timedelta/out.sql index 01ebebc455..057e6c778e 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_timedelta/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_timedelta/out.sql @@ -8,7 +8,7 @@ WITH `bfcte_0` AS ( *, `bfcol_1` AS `bfcol_4`, `bfcol_0` AS `bfcol_5`, - INTERVAL `bfcol_0` MICROSECOND AS `bfcol_6` + `bfcol_0` AS `bfcol_6` FROM `bfcte_0` ), `bfcte_2` AS ( SELECT @@ -16,7 +16,7 @@ WITH `bfcte_0` AS ( `bfcol_4` AS `bfcol_10`, `bfcol_5` AS `bfcol_11`, `bfcol_6` AS `bfcol_12`, - INTERVAL (`bfcol_5` * 1000000) MICROSECOND AS `bfcol_13` + `bfcol_5` * 1000000 AS `bfcol_13` FROM `bfcte_1` ), `bfcte_3` AS ( SELECT @@ -25,7 +25,7 @@ WITH `bfcte_0` AS ( `bfcol_11` AS `bfcol_19`, `bfcol_12` AS `bfcol_20`, `bfcol_13` AS `bfcol_21`, - INTERVAL (`bfcol_11` * 604800000000) MICROSECOND AS `bfcol_22` + `bfcol_11` * 604800000000 AS `bfcol_22` FROM `bfcte_2` ) SELECT diff --git a/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py index 815bb84a9a..fced18f5be 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py @@ -370,12 +370,25 @@ def test_invert(scalar_types_df: bpd.DataFrame, snapshot): def test_is_in(scalar_types_df: bpd.DataFrame, snapshot): - col_name = "int64_col" - bf_df = scalar_types_df[[col_name]] - sql = _apply_unary_ops( - bf_df, [ops.IsInOp(values=(1, 2, 3)).as_expr(col_name)], [col_name] - ) + int_col = "int64_col" + float_col = "float64_col" + bf_df = scalar_types_df[[int_col, float_col]] + ops_map = { + "ints": ops.IsInOp(values=(1, 2, 3)).as_expr(int_col), + "ints_w_null": ops.IsInOp(values=(None, 123456)).as_expr(int_col), + "floats": ops.IsInOp(values=(1.0, 2.0, 3.0), match_nulls=False).as_expr( + int_col + ), + "strings": ops.IsInOp(values=("1.0", "2.0")).as_expr(int_col), + "mixed": ops.IsInOp(values=("1.0", 2.5, 3)).as_expr(int_col), + "empty": ops.IsInOp(values=()).as_expr(int_col), + "ints_wo_match_nulls": ops.IsInOp( + values=(None, 123456), match_nulls=False + ).as_expr(int_col), + "float_in_ints": ops.IsInOp(values=(1, 2, 3, None)).as_expr(float_col), + } + sql = _apply_unary_ops(bf_df, list(ops_map.values()), list(ops_map.keys())) snapshot.assert_match(sql, "out.sql") diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py index 61bafeeca2..9af2a4afe4 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py @@ -1088,6 +1088,22 @@ def visit_ArrayAggregate(self, op, *, arg, order_by, where): expr = arg return sge.IgnoreNulls(this=self.agg.array_agg(expr, where=where)) + def visit_StringAgg(self, op, 
*, arg, sep, order_by, where): + if len(order_by) > 0: + expr = sge.Order( + this=arg, + expressions=[ + # Avoid adding NULLS FIRST / NULLS LAST in SQL, which is + # unsupported in STRING_AGG, by reconstructing the node as + # plain SQL text. + f"({order_column.args['this'].sql(dialect='bigquery')}) {'DESC' if order_column.args.get('desc') else 'ASC'}" + for order_column in order_by + ], + ) + else: + expr = arg + return self.agg.string_agg(expr, sep, where=where) + def visit_FirstNonNullValue(self, op, *, arg): return sge.IgnoreNulls(this=sge.FirstValue(this=arg)) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py index 34f6406e0c..c3f2a03223 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py @@ -401,3 +401,20 @@ class ArrayAggregate(Filterable, Reduction): @attribute def dtype(self): return dt.Array(self.arg.dtype) + + +@public +class StringAgg(Filterable, Reduction): + """ + Collects the elements of this expression into a string. Similar to + the ibis `GroupConcat`, but adds an `order_by` parameter. + """ + + arg: Column + sep: Value[dt.String] + + order_by: VarTuple[Value] = () + + @attribute + def dtype(self): + return dt.string diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index 9b5b461ea5..fe94bf3049 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -1298,3 +1298,43 @@ def center( bigframes.series.Series: Returns Series or Index with minimum number of char in object. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def join(self, sep: str): + """ + Join lists contained as elements in the Series/Index with passed delimiter. + + If the elements of a Series are lists themselves, join the content of these + lists using the delimiter passed to the function. + This function is equivalent to :meth:`str.join`. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Example with lists of strings. + + >>> s = bpd.Series([['lion', 'elephant', 'zebra'], + ... ['dragon'], + ... ['duck', 'swan', 'fish', 'guppy']]) + >>> s + 0 ['lion' 'elephant' 'zebra'] + 1 ['dragon'] + 2 ['duck' 'swan' 'fish' 'guppy'] + dtype: list[pyarrow] + + >>> s.str.join('-') + 0 lion-elephant-zebra + 1 dragon + 2 duck-swan-fish-guppy + dtype: string + + Args: + sep (str): + Delimiter to use between list entries. + + Returns: + bigframes.series.Series: The list entries concatenated by intervening occurrences of the delimiter. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 78b6498d2d..558f26d68e 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.18.0" +__version__ = "2.19.0" # {x-release-please-start-date} -__release_date__ = "2025-09-03" +__release_date__ = "2025-09-09" # {x-release-please-end}
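For reference, a hedged usage sketch of the new `cloud_function_vpc_connector_egress_settings` parameter, mirroring the system test above (the connector name is hypothetical; valid values are `all`, `private-ranges-only`, and `unspecified`, with `private-ranges-only` as the default, and anything else raises `ValueError` in `create_cloud_function`):

```python
import bigframes.pandas as bpd

def square_num(x):
    return x * x

# Route all outbound traffic from the backing cloud function through the
# VPC connector; the default only routes private IP ranges.
square_remote = bpd.remote_function(
    [int],
    int,
    reuse=False,
    cloud_function_service_account="default",
    cloud_function_vpc_connector="my-vpc-connector",  # hypothetical connector
    cloud_function_vpc_connector_egress_settings="all",
)(square_num)
```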