Skip to content

Commit 863d694

Browse files
feat: add bigframes.bigquery.create_vector_index to assist in creating vector index on ARRAY<FLOAT64> columns (#1024)
* feat: add `bigframes.bigquery.create_vector_index` to assist in creating vector index on `ARRAY<FLOAT64>` columns * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * fix lint errors * format * fix mypy * fix test for older google-cloud-bigquery * fix type error * fix typing * use googlesql.identifier to escape column and table ids * wait for job to finish --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent 7003d1a commit 863d694

File tree

10 files changed

+836
-485
lines changed

10 files changed

+836
-485
lines changed

bigframes/bigquery/__init__.py

Lines changed: 26 additions & 482 deletions
Large diffs are not rendered by default.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
#     http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
#     http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import bigframes.operations.aggregations as agg_ops
18+
import bigframes.series as series
19+
20+
"""
21+
Approximate functions defined from
22+
https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions
23+
"""
24+
25+
26+
def approx_top_count(
    series: series.Series,
    number: int,
) -> series.Series:
    """Returns the approximate top elements of the Series as an array of STRUCTs.

    The `number` parameter specifies the number of elements returned.

    Each `STRUCT` contains two fields. The first field (named `value`) contains an input
    value. The second field (named `count`) contains an `INT64` specifying the number
    of times the value was returned.

    Returns `NULL` if there are zero input rows.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None
        >>> s = bpd.Series(["apple", "apple", "pear", "pear", "pear", "banana"])
        >>> bbq.approx_top_count(s, number=2)
        [{'value': 'pear', 'count': 3}, {'value': 'apple', 'count': 2}]

    Args:
        series (bigframes.series.Series):
            The Series with any data type that the `GROUP BY` clause supports.
        number (int):
            The number of top elements to return. Must be at least 1.

    Returns:
        bigframes.series.Series: A new Series with the result data.

    Raises:
        ValueError: If `number` is less than 1.
    """
    # Reject an invalid element count before any aggregation op is built.
    if number < 1:
        raise ValueError("The number of approx_top_count must be at least 1")
    return series._apply_aggregation(agg_ops.ApproxTopCountOp(number=number))
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
#     http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""
16+
Array functions defined from
17+
https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions
18+
"""
19+
20+
21+
from __future__ import annotations
22+
23+
import typing
24+
25+
import bigframes_vendored.constants as constants
26+
27+
import bigframes.core.groupby as groupby
28+
import bigframes.operations as ops
29+
import bigframes.operations.aggregations as agg_ops
30+
import bigframes.series as series
31+
32+
if typing.TYPE_CHECKING:
33+
import bigframes.dataframe as dataframe
34+
35+
36+
def array_length(series: series.Series) -> series.Series:
    """Compute the length of each array element in the Series.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]])
        >>> bbq.array_length(s)
        0    4
        1    0
        2    2
        dtype: Int64

    You can also apply this function directly to Series.

        >>> s.apply(bbq.array_length, by_row=False)
        0    4
        1    0
        2    2
        dtype: Int64

    Args:
        series (bigframes.series.Series): A Series with array columns.

    Returns:
        bigframes.series.Series: A Series of integer values indicating
            the length of each element in the Series.

    """
    # Element-wise: the length op is applied to every array in the Series.
    return series._apply_unary_op(ops.len_op)
69+
70+
71+
def array_agg(
    obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy,
) -> series.Series | dataframe.DataFrame:
    """Group data and create arrays from selected columns, omitting NULLs to avoid
    BigQuery errors (NULLs not allowed in arrays).

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> import numpy as np
        >>> bpd.options.display.progress_bar = None

    For a SeriesGroupBy object:

        >>> lst = ['a', 'a', 'b', 'b', 'a']
        >>> s = bpd.Series([1, 2, 3, 4, np.nan], index=lst)
        >>> bbq.array_agg(s.groupby(level=0))
        a    [1. 2.]
        b    [3. 4.]
        dtype: list<item: double>[pyarrow]

    For a DataFrameGroupBy object:

        >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
        >>> df = bpd.DataFrame(l, columns=["a", "b", "c"])
        >>> bbq.array_agg(df.groupby(by=["b"]))
                 a      c
        b
        1.0    [2]    [3]
        2.0  [1 1]  [3 2]
        <BLANKLINE>
        [2 rows x 2 columns]

    Args:
        obj (groupby.SeriesGroupBy | groupby.DataFrameGroupBy):
            The GroupBy object to apply the function to.

    Returns:
        bigframes.series.Series | bigframes.dataframe.DataFrame: A Series or
            DataFrame containing aggregated array columns, and indexed by the
            original group columns.

    Raises:
        ValueError: If `obj` is neither a SeriesGroupBy nor a DataFrameGroupBy.
    """
    if isinstance(obj, groupby.SeriesGroupBy):
        return obj._aggregate(agg_ops.ArrayAggOp())
    elif isinstance(obj, groupby.DataFrameGroupBy):
        # numeric_only=False so non-numeric columns are aggregated as well.
        return obj._aggregate_all(agg_ops.ArrayAggOp(), numeric_only=False)
    else:
        raise ValueError(
            f"Unsupported type {type(obj)} to apply `array_agg` function. {constants.FEEDBACK_LINK}"
        )
122+
123+
124+
def array_to_string(series: series.Series, delimiter: str) -> series.Series:
    """Converts array elements within a Series into delimited strings.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> import numpy as np
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]])
        >>> bbq.array_to_string(s, delimiter=", ")
        0         H, i, !
        1    Hello, World
        2
        3
        4              Hi
        dtype: string

    Args:
        series (bigframes.series.Series): A Series containing arrays.
        delimiter (str): The string used to separate array elements.

    Returns:
        bigframes.series.Series: A Series containing delimited strings.

    """
    # The delimiter is carried by the op itself, so this is a unary operation.
    return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter))
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
#     http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
"""
17+
JSON functions defined from
18+
https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions
19+
"""
20+
21+
22+
from __future__ import annotations
23+
24+
from typing import Any, Sequence, Tuple
25+
26+
import bigframes.operations as ops
27+
import bigframes.series as series
28+
29+
30+
def json_set(
    series: series.Series,
    json_path_value_pairs: Sequence[Tuple[str, Any]],
) -> series.Series:
    """Produces a new JSON value within a Series by inserting or replacing values at
    specified paths.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> import numpy as np
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"]
        >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
        0    {"a":100,"b":"hi"}
        Name: data, dtype: string

    Args:
        series (bigframes.series.Series):
            The Series containing JSON data (as native JSON objects or JSON-formatted strings).
        json_path_value_pairs (Sequence[Tuple[str, Any]]):
            Pairs of JSON path and the new value to insert/replace. The pairs
            are applied one after another, in the order given.

    Returns:
        bigframes.series.Series: A new Series with the transformed JSON data.
            If `json_path_value_pairs` is empty, the input Series is returned
            as is.

    Raises:
        ValueError: If any entry of `json_path_value_pairs` does not contain
            exactly two items.

    """
    # SQLGlot parser does not support the "create_if_missing => true" syntax, so
    # create_if_missing is not currently implemented.

    for json_path_value_pair in json_path_value_pairs:
        if len(json_path_value_pair) != 2:
            raise ValueError(
                "Incorrect format: Expected (<json_path>, <json_value>), but found: "
                + f"{json_path_value_pair}"
            )

        json_path, json_value = json_path_value_pair
        # Each pair is applied to the result of the previous one, so later
        # pairs see the changes made by earlier pairs.
        series = series._apply_binary_op(
            json_value, ops.JSONSet(json_path=json_path), alignment="left"
        )
    return series
74+
75+
76+
def json_extract(
    series: series.Series,
    json_path: str,
) -> series.Series:
    """Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON`
    value. This function uses single quotes and brackets to escape invalid JSONPath
    characters in JSON keys.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}'])
        >>> bbq.json_extract(s, json_path="$.class")
        0    {"students":[{"id":5},{"id":12}]}
        dtype: string

    Args:
        series (bigframes.series.Series):
            The Series containing JSON data (as native JSON objects or JSON-formatted strings).
        json_path (str):
            The JSON path identifying the data that you want to obtain from the input.

    Returns:
        bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING.
    """
    # The path is carried by the op itself, so this is a unary operation.
    return series._apply_unary_op(ops.JSONExtract(json_path=json_path))
105+
106+
107+
def json_extract_array(
    series: series.Series,
    json_path: str = "$",
) -> series.Series:
    """Extracts a JSON array and converts it to a SQL array of JSON-formatted `STRING` or `JSON`
    values. This function uses single quotes and brackets to escape invalid JSONPath
    characters in JSON keys.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]'])
        >>> bbq.json_extract_array(s)
        0    ['1' '2' '3']
        1        ['4' '5']
        dtype: list<item: string>[pyarrow]

    Args:
        series (bigframes.series.Series):
            The Series containing JSON data (as native JSON objects or JSON-formatted strings).
        json_path (str):
            The JSON path identifying the data that you want to obtain from the input.
            Defaults to ``"$"``.

    Returns:
        bigframes.series.Series: A new Series whose elements are arrays of
            JSON or JSON-formatted STRING values.
    """
    # The path is carried by the op itself, so this is a unary operation.
    return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path))

0 commit comments

Comments
 (0)