
Commit 580e1b9

feat: Add Series.peek to preview data efficiently (#727)
Co-authored-by: Tim Sweña (Swast) <[email protected]>
1 parent 487dff6 commit 580e1b9

9 files changed (+506, −23 lines)


bigframes/core/blocks.py

Lines changed: 3 additions & 3 deletions

@@ -2286,13 +2286,13 @@ def to_sql_query(
             idx_labels,
         )

-    def cached(self, *, optimize_offsets=False, force: bool = False) -> None:
+    def cached(self, *, force: bool = False, session_aware: bool = False) -> None:
         """Write the block to a session table."""
         # use a heuristic for whether something needs to be cached
         if (not force) and self.session._is_trivially_executable(self.expr):
             return
-        if optimize_offsets:
-            self.session._cache_with_offsets(self.expr)
+        elif session_aware:
+            self.session._cache_with_session_awareness(self.expr)
         else:
             self.session._cache_with_cluster_cols(
                 self.expr, cluster_cols=self.index_columns
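
The change decouples whether to cache (force) from how the cached table is laid out (session_aware). A minimal call-site sketch, using these private methods exactly as they appear in the hunk; internal API, shown only for illustration:

block.cached()                                  # heuristic: skipped if the expression is trivially executable
block.cached(force=True, session_aware=False)   # always cache, clustered on the index columns
block.cached(force=True, session_aware=True)    # always cache, placement informed by other session queries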

bigframes/core/pruning.py

Lines changed: 77 additions & 0 deletions (new file)

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import bigframes.core.expression as ex
import bigframes.core.schema as schemata
import bigframes.dtypes
import bigframes.operations as ops

LOW_CARDINALITY_TYPES = [bigframes.dtypes.BOOL_DTYPE]

COMPARISON_OP_TYPES = tuple(
    type(i)
    for i in (
        ops.eq_op,
        ops.eq_null_match_op,
        ops.ne_op,
        ops.gt_op,
        ops.ge_op,
        ops.lt_op,
        ops.le_op,
    )
)


def cluster_cols_for_predicate(
    predicate: ex.Expression, schema: schemata.ArraySchema
) -> list[str]:
    """Try to determine cluster col candidates that work with given predicates."""
    # TODO: Prioritize based on predicted selectivity (eg. equality conditions are probably very selective)
    if isinstance(predicate, ex.UnboundVariableExpression):
        cols = [predicate.id]
    elif isinstance(predicate, ex.OpExpression):
        op = predicate.op
        # TODO: Support geo predicates, which support pruning if clustered (other than st_disjoint)
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions
        if isinstance(op, COMPARISON_OP_TYPES):
            cols = cluster_cols_for_comparison(predicate.inputs[0], predicate.inputs[1])
        elif isinstance(op, (type(ops.invert_op))):
            cols = cluster_cols_for_predicate(predicate.inputs[0], schema)
        elif isinstance(op, (type(ops.and_op), type(ops.or_op))):
            left_cols = cluster_cols_for_predicate(predicate.inputs[0], schema)
            right_cols = cluster_cols_for_predicate(predicate.inputs[1], schema)
            cols = [*left_cols, *[col for col in right_cols if col not in left_cols]]
        else:
            cols = []
    else:
        # Constant
        cols = []
    return [
        col for col in cols if bigframes.dtypes.is_clusterable(schema.get_type(col))
    ]


def cluster_cols_for_comparison(
    left_ex: ex.Expression, right_ex: ex.Expression
) -> list[str]:
    # TODO: Try to normalize expressions such that one side is a single variable.
    # eg. Convert -cola>=3 to cola<-3 and colb+3 < 4 to colb < 1
    if left_ex.is_const:
        # There are some invertible ops that would also be ok
        if isinstance(right_ex, ex.UnboundVariableExpression):
            return [right_ex.id]
    elif right_ex.is_const:
        if isinstance(left_ex, ex.UnboundVariableExpression):
            return [left_ex.id]
    return []
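
To make the recursion concrete, here is a self-contained sketch of the same candidate-collection logic over simplified stand-in expression classes. Var, Const, and Op are hypothetical toys, not the bigframes expression types, and the final clusterable-dtype filter is omitted:

from dataclasses import dataclass

@dataclass(frozen=True)
class Var:
    id: str

@dataclass(frozen=True)
class Const:
    value: object

@dataclass(frozen=True)
class Op:
    name: str  # "eq", "lt", "and", "or", "not", ...
    inputs: tuple

COMPARISONS = {"eq", "ne", "lt", "le", "gt", "ge"}

def cluster_cols(pred) -> list[str]:
    if isinstance(pred, Var):
        return [pred.id]  # a bare boolean column can prune directly
    if isinstance(pred, Op):
        if pred.name in COMPARISONS:
            left, right = pred.inputs
            # only var-vs-constant comparisons yield a usable cluster column
            if isinstance(left, Const) and isinstance(right, Var):
                return [right.id]
            if isinstance(right, Const) and isinstance(left, Var):
                return [left.id]
            return []
        if pred.name == "not":
            return cluster_cols(pred.inputs[0])
        if pred.name in ("and", "or"):
            # merge candidates from both sides, de-duplicated, left side first
            left_cols = cluster_cols(pred.inputs[0])
            right_cols = cluster_cols(pred.inputs[1])
            return [*left_cols, *[c for c in right_cols if c not in left_cols]]
    return []  # constants and unsupported ops contribute nothing

# (col_a == 1) AND (col_b > 2) yields both columns as cluster candidates:
pred = Op("and", (Op("eq", (Var("col_a"), Const(1))), Op("gt", (Var("col_b"), Const(2)))))
assert cluster_cols(pred) == ["col_a", "col_b"]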

bigframes/core/tree_properties.py

Lines changed: 38 additions & 1 deletion

@@ -15,7 +15,7 @@

 import functools
 import itertools
-from typing import Callable, Dict, Optional
+from typing import Callable, Dict, Optional, Sequence

 import bigframes.core.nodes as nodes

@@ -91,6 +91,43 @@ def _node_counts_inner(
     )


+def count_nodes(forest: Sequence[nodes.BigFrameNode]) -> dict[nodes.BigFrameNode, int]:
+    """
+    Counts the number of instances of each subtree present within a forest.
+
+    Memoizes internally to accelerate execution, but the cache is not persisted
+    (not reused between invocations).
+
+    Args:
+        forest (Sequence of BigFrameNode):
+            The roots of each tree in the forest.
+
+    Returns:
+        dict[BigFrameNode, int]: The number of occurrences of each subtree.
+    """
+
+    def _combine_counts(
+        left: Dict[nodes.BigFrameNode, int], right: Dict[nodes.BigFrameNode, int]
+    ) -> Dict[nodes.BigFrameNode, int]:
+        return {
+            key: left.get(key, 0) + right.get(key, 0)
+            for key in itertools.chain(left.keys(), right.keys())
+        }
+
+    empty_counts: Dict[nodes.BigFrameNode, int] = {}
+
+    @functools.cache
+    def _node_counts_inner(
+        subtree: nodes.BigFrameNode,
+    ) -> Dict[nodes.BigFrameNode, int]:
+        """Helper function to count occurrences of duplicate nodes in a subtree."""
+        child_counts = [_node_counts_inner(child) for child in subtree.child_nodes]
+        node_counts = functools.reduce(_combine_counts, child_counts, empty_counts)
+        return _combine_counts(node_counts, {subtree: 1})
+
+    counts = [_node_counts_inner(root) for root in forest]
+    return functools.reduce(_combine_counts, counts, empty_counts)
+
+
 def replace_nodes(
     root: nodes.BigFrameNode,
     replacements: dict[nodes.BigFrameNode, nodes.BigFrameNode],
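
The counting pattern is easiest to see on a toy tree: nodes are hashable values, so functools.cache deduplicates work on shared subtrees while the final counts still reflect every occurrence. A minimal self-contained sketch with a toy Node class (not a bigframes type):

import functools
import itertools
from dataclasses import dataclass

@dataclass(frozen=True)
class Node:
    name: str
    children: tuple = ()

def count_nodes(forest):
    def combine(left, right):
        # merge two count dicts, summing per-node counts
        return {
            key: left.get(key, 0) + right.get(key, 0)
            for key in itertools.chain(left, right)
        }

    @functools.cache  # requires hashable (frozen) nodes
    def counts(node):
        child_counts = (counts(child) for child in node.children)
        return combine(functools.reduce(combine, child_counts, {}), {node: 1})

    return functools.reduce(combine, (counts(root) for root in forest), {})

leaf = Node("scan")
forest = [Node("filter", (leaf,)), Node("project", (leaf,))]
assert count_nodes(forest)[leaf] == 2  # the shared scan subtree occurs in both trees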

bigframes/dtypes.py

Lines changed: 68 additions & 9 deletions

@@ -74,52 +74,95 @@ class SimpleDtypeInfo:
     logical_bytes: int = (
         8  # this is approximate only, some types are variably sized, also, compression
     )
+    orderable: bool = False
+    clusterable: bool = False


 # TODO: Missing BQ types: INTERVAL, JSON, RANGE
 # TODO: Add mappings to python types
 SIMPLE_TYPES = (
     SimpleDtypeInfo(
-        dtype=INT_DTYPE, arrow_dtype=pa.int64(), type_kind=("INT64", "INTEGER")
+        dtype=INT_DTYPE,
+        arrow_dtype=pa.int64(),
+        type_kind=("INT64", "INTEGER"),
+        orderable=True,
+        clusterable=True,
     ),
     SimpleDtypeInfo(
-        dtype=FLOAT_DTYPE, arrow_dtype=pa.float64(), type_kind=("FLOAT64", "FLOAT")
+        dtype=FLOAT_DTYPE,
+        arrow_dtype=pa.float64(),
+        type_kind=("FLOAT64", "FLOAT"),
+        orderable=True,
     ),
     SimpleDtypeInfo(
         dtype=BOOL_DTYPE,
         arrow_dtype=pa.bool_(),
         type_kind=("BOOL", "BOOLEAN"),
         logical_bytes=1,
+        orderable=True,
+        clusterable=True,
     ),
-    SimpleDtypeInfo(dtype=STRING_DTYPE, arrow_dtype=pa.string(), type_kind=("STRING",)),
     SimpleDtypeInfo(
-        dtype=DATE_DTYPE, arrow_dtype=pa.date32(), type_kind=("DATE",), logical_bytes=4
+        dtype=STRING_DTYPE,
+        arrow_dtype=pa.string(),
+        type_kind=("STRING",),
+        orderable=True,
+        clusterable=True,
     ),
-    SimpleDtypeInfo(dtype=TIME_DTYPE, arrow_dtype=pa.time64("us"), type_kind=("TIME",)),
     SimpleDtypeInfo(
-        dtype=DATETIME_DTYPE, arrow_dtype=pa.timestamp("us"), type_kind=("DATETIME",)
+        dtype=DATE_DTYPE,
+        arrow_dtype=pa.date32(),
+        type_kind=("DATE",),
+        logical_bytes=4,
+        orderable=True,
+        clusterable=True,
+    ),
+    SimpleDtypeInfo(
+        dtype=TIME_DTYPE,
+        arrow_dtype=pa.time64("us"),
+        type_kind=("TIME",),
+        orderable=True,
+    ),
+    SimpleDtypeInfo(
+        dtype=DATETIME_DTYPE,
+        arrow_dtype=pa.timestamp("us"),
+        type_kind=("DATETIME",),
+        orderable=True,
+        clusterable=True,
     ),
     SimpleDtypeInfo(
         dtype=TIMESTAMP_DTYPE,
         arrow_dtype=pa.timestamp("us", tz="UTC"),
         type_kind=("TIMESTAMP",),
+        orderable=True,
+        clusterable=True,
+    ),
+    SimpleDtypeInfo(
+        dtype=BYTES_DTYPE, arrow_dtype=pa.binary(), type_kind=("BYTES",), orderable=True
     ),
-    SimpleDtypeInfo(dtype=BYTES_DTYPE, arrow_dtype=pa.binary(), type_kind=("BYTES",)),
     SimpleDtypeInfo(
         dtype=NUMERIC_DTYPE,
         arrow_dtype=pa.decimal128(38, 9),
         type_kind=("NUMERIC",),
         logical_bytes=16,
+        orderable=True,
+        clusterable=True,
     ),
     SimpleDtypeInfo(
        dtype=BIGNUMERIC_DTYPE,
         arrow_dtype=pa.decimal256(76, 38),
         type_kind=("BIGNUMERIC",),
         logical_bytes=32,
+        orderable=True,
+        clusterable=True,
     ),
     # Geo has no corresponding arrow dtype
     SimpleDtypeInfo(
-        dtype=GEO_DTYPE, arrow_dtype=None, type_kind=("GEOGRAPHY",), logical_bytes=40
+        dtype=GEO_DTYPE,
+        arrow_dtype=None,
+        type_kind=("GEOGRAPHY",),
+        logical_bytes=40,
+        clusterable=True,
     ),
 )

@@ -209,9 +252,25 @@ def is_comparable(type: ExpressionType) -> bool:
     return (type is not None) and is_orderable(type)


+_ORDERABLE_SIMPLE_TYPES = set(
+    mapping.dtype for mapping in SIMPLE_TYPES if mapping.orderable
+)
+
+
 def is_orderable(type: ExpressionType) -> bool:
     # On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable
-    return not is_array_like(type) and not is_struct_like(type) and (type != GEO_DTYPE)
+    return type in _ORDERABLE_SIMPLE_TYPES
+
+
+_CLUSTERABLE_SIMPLE_TYPES = set(
+    mapping.dtype for mapping in SIMPLE_TYPES if mapping.clusterable
+)
+
+
+def is_clusterable(type: ExpressionType) -> bool:
+    # https://cloud.google.com/bigquery/docs/clustered-tables#cluster_column_types
+    # Based on the default dtype-to-BigQuery type mapping; in theory a value could
+    # be represented as a non-default BQ type that supports clustering.
+    return type in _CLUSTERABLE_SIMPLE_TYPES


 def is_bool_coercable(type: ExpressionType) -> bool:
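
A usage sketch of the two new predicates, with expectations read directly off the table above (FLOAT64 and TIME gain orderable only; GEOGRAPHY gains clusterable only):

import bigframes.dtypes as dtypes

assert dtypes.is_clusterable(dtypes.INT_DTYPE)        # INT64: orderable and clusterable
assert dtypes.is_orderable(dtypes.FLOAT_DTYPE)        # FLOAT64 sorts fine...
assert not dtypes.is_clusterable(dtypes.FLOAT_DTYPE)  # ...but BigQuery cannot cluster on it
assert dtypes.is_clusterable(dtypes.GEO_DTYPE)        # GEOGRAPHY clusters...
assert not dtypes.is_orderable(dtypes.GEO_DTYPE)      # ...but has no ordering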

bigframes/series.py

Lines changed: 39 additions & 4 deletions

@@ -623,6 +623,40 @@ def head(self, n: int = 5) -> Series:
     def tail(self, n: int = 5) -> Series:
         return typing.cast(Series, self.iloc[-n:])

+    def peek(self, n: int = 5, *, force: bool = True) -> pandas.Series:
+        """
+        Preview n arbitrary elements from the series without guarantees about row selection or ordering.
+
+        ``Series.peek(force=False)`` will always be very fast, but will not succeed if data requires
+        full data scanning. Using ``force=True`` will always succeed, but may perform queries.
+        Query results will be cached so that future steps will benefit from these queries.
+
+        Args:
+            n (int, default 5):
+                The number of rows to select from the series. Which N rows are returned is non-deterministic.
+            force (bool, default True):
+                If the data cannot be peeked efficiently, the series will instead be fully materialized as part
+                of the operation if ``force=True``. If ``force=False``, the operation will throw a ValueError.
+        Returns:
+            pandas.Series: A pandas Series with n rows.
+
+        Raises:
+            ValueError: If force=False and data cannot be efficiently peeked.
+        """
+        maybe_result = self._block.try_peek(n)
+        if maybe_result is None:
+            if force:
+                self._cached()
+                maybe_result = self._block.try_peek(n, force=True)
+                assert maybe_result is not None
+            else:
+                raise ValueError(
+                    "Cannot peek efficiently when data has aggregates, joins or window functions applied. Use force=True to fully compute dataframe."
+                )
+        as_series = maybe_result.squeeze(axis=1)
+        as_series.name = self.name
+        return as_series
+
     def nlargest(self, n: int = 5, keep: str = "first") -> Series:
         if keep not in ("first", "last", "all"):
             raise ValueError("keep must be one of 'first', 'last', or 'all'")

@@ -1419,7 +1453,7 @@ def apply(

         # return Series with materialized result so that any error in the remote
         # function is caught early
-        materialized_series = result_series._cached()
+        materialized_series = result_series._cached(session_aware=False)
         return materialized_series

     def combine(

@@ -1794,10 +1828,11 @@ def cache(self):
         Returns:
             Series: Self
         """
-        return self._cached(force=True)
+        # Do not use session-aware caching when the user explicitly requests caching
+        return self._cached(force=True, session_aware=False)

-    def _cached(self, *, force: bool = True) -> Series:
-        self._block.cached(force=force)
+    def _cached(self, *, force: bool = True, session_aware: bool = True) -> Series:
+        self._block.cached(force=force, session_aware=session_aware)
         return self

     def _optimize_query_complexity(self):
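
A hedged usage sketch of the new method, following the docstring above; the public-dataset table name is illustrative, any table works:

import bigframes.pandas as bpd

s = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")["name"]
s.peek(3)  # fast path: three arbitrary rows without scanning the full table

counts = s.value_counts()  # aggregation result: no longer trivially peekable
try:
    counts.peek(3, force=False)  # refuses rather than run an expensive query
except ValueError:
    pass
counts.peek(3)  # force=True (default): materializes and caches, then peeks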
