Skip to content

Commit 86e54b1

Browse files
shobsi and tswast authored
fix: support read_gbq_function for axis=1 application (#950)
* fix: support `read_gbq_function` for axis=1 application
* remove stray newline
* Update bigframes/session/__init__.py
* remove first person reference in the doc
* use correct product name
---------
Co-authored-by: Tim Sweña (Swast) <[email protected]>
1 parent c750be6 commit 86e54b1

File tree

5 files changed

+51
-8
lines changed

5 files changed

+51
-8
lines changed

bigframes/functions/_remote_function_session.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ def remote_function(
176176
getting and setting IAM roles on cloud resources. If this param is
177177
not provided then resource manager client from the session would be
178178
used.
179-
dataset (str, Optional.):
179+
dataset (str, Optional):
180180
Dataset in which to create a BigQuery remote function. It should be in
181181
`<project_id>.<dataset_name>` or `<dataset_name>` format. If this
182182
parameter is not provided then session dataset id is used.

bigframes/functions/remote_function.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ def read_gbq_function(
108108
function_name: str,
109109
*,
110110
session: Session,
111+
is_row_processor: bool = False,
111112
):
112113
"""
113114
Read an existing BigQuery function and prepare it for use in future queries.
@@ -194,5 +195,6 @@ def func(*ignored_args, **ignored_kwargs):
194195
func.output_dtype = bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( # type: ignore
195196
ibis_signature.output_type
196197
)
198+
func.is_row_processor = is_row_processor # type: ignore
197199
func.ibis_node = node # type: ignore
198200
return func

bigframes/pandas/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -692,10 +692,11 @@ def remote_function(
692692
remote_function.__doc__ = inspect.getdoc(bigframes.session.Session.remote_function)
693693

694694

695-
def read_gbq_function(function_name: str):
695+
def read_gbq_function(function_name: str, is_row_processor: bool = False):
696696
return global_session.with_default_session(
697697
bigframes.session.Session.read_gbq_function,
698698
function_name=function_name,
699+
is_row_processor=is_row_processor,
699700
)
700701

701702

bigframes/session/__init__.py

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1225,6 +1225,7 @@ def remote_function(
12251225
def read_gbq_function(
12261226
self,
12271227
function_name: str,
1228+
is_row_processor: bool = False,
12281229
):
12291230
"""Loads a BigQuery function from BigQuery.
12301231
@@ -1255,7 +1256,7 @@ def read_gbq_function(
12551256
>>> func('AURÉLIE')
12561257
'aurÉlie'
12571258
1258-
You can apply it to a BigQuery DataFrame Series.
1259+
You can apply it to a BigQuery DataFrames Series.
12591260
12601261
>>> df = bpd.DataFrame({'id': [1, 2, 3], 'name': ['AURÉLIE', 'CÉLESTINE', 'DAPHNÉ']})
12611262
>>> df
@@ -1275,13 +1276,33 @@ def read_gbq_function(
12751276
<BLANKLINE>
12761277
[3 rows x 3 columns]
12771278
1278-
You can even use a function with multiple inputs. For example, let's use
1279-
[cw_instr4](https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/README.md#cw_instr4source-string-search-string-position-int64-ocurrence-int64)
1279+
You can even use a function with multiple inputs. For example,
1280+
[cw_regexp_replace_5](https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/README.md#cw_regexp_replace_5haystack-string-regexp-string-replacement-string-offset-int64-occurrence-int64)
12801281
from Community UDFs.
12811282
1282-
>>> func = bpd.read_gbq_function("bqutil.fn.cw_instr4")
1283-
>>> func('TestStr123456Str', 'Str', 1, 2)
1284-
14
1283+
>>> func = bpd.read_gbq_function("bqutil.fn.cw_regexp_replace_5")
1284+
>>> func('TestStr123456', 'Str', 'Cad$', 1, 1)
1285+
'TestCad$123456'
1286+
1287+
>>> df = bpd.DataFrame({
1288+
... "haystack" : ["TestStr123456", "TestStr123456Str", "TestStr123456Str"],
1289+
... "regexp" : ["Str", "Str", "Str"],
1290+
... "replacement" : ["Cad$", "Cad$", "Cad$"],
1291+
... "offset" : [1, 1, 1],
1292+
... "occurrence" : [1, 2, 1]
1293+
... })
1294+
>>> df
1295+
haystack regexp replacement offset occurrence
1296+
0 TestStr123456 Str Cad$ 1 1
1297+
1 TestStr123456Str Str Cad$ 1 2
1298+
2 TestStr123456Str Str Cad$ 1 1
1299+
<BLANKLINE>
1300+
[3 rows x 5 columns]
1301+
>>> df.apply(func, axis=1)
1302+
0 TestCad$123456
1303+
1 TestStr123456Cad$
1304+
2 TestCad$123456Str
1305+
dtype: string
12851306
12861307
Args:
12871308
function_name (str):
@@ -1290,6 +1311,10 @@ def read_gbq_function(
12901311
`dataset_id.function_name` to load from the default project, or
12911312
`function_name` to load from the default project and the dataset
12921313
associated with the current session.
1314+
is_row_processor (bool, default False):
1315+
Whether the function is a row processor. This is set to True
1316+
for a function which receives an entire row of a DataFrame as
1317+
a pandas Series.
12931318
12941319
Returns:
12951320
callable: A function object pointing to the BigQuery function read
@@ -1303,6 +1328,7 @@ def read_gbq_function(
13031328
return bigframes_rf.read_gbq_function(
13041329
function_name=function_name,
13051330
session=self,
1331+
is_row_processor=is_row_processor,
13061332
)
13071333

13081334
def _prepare_copy_job_config(self) -> bigquery.CopyJobConfig:

tests/system/large/test_remote_function.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1603,6 +1603,13 @@ def serialize_row(row):
16031603
# bf_result.dtype is 'string[pyarrow]' while pd_result.dtype is 'object'
16041604
# , ignore this mismatch by using check_dtype=False.
16051605
pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
1606+
1607+
# Let's make sure the read_gbq_function path works for this function
1608+
serialize_row_reuse = session.read_gbq_function(
1609+
serialize_row_remote.bigframes_remote_function, is_row_processor=True
1610+
)
1611+
bf_result = scalars_df[columns].apply(serialize_row_reuse, axis=1).to_pandas()
1612+
pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
16061613
finally:
16071614
# clean up the gcp assets created for the remote function
16081615
cleanup_remote_function_assets(
@@ -2085,6 +2092,13 @@ def foo(x, y, z):
20852092
pandas.testing.assert_series_equal(
20862093
expected_result, bf_result, check_dtype=False, check_index_type=False
20872094
)
2095+
2096+
# Let's make sure the read_gbq_function path works for this function
2097+
foo_reuse = session.read_gbq_function(foo.bigframes_remote_function)
2098+
bf_result = bf_df.apply(foo_reuse, axis=1).to_pandas()
2099+
pandas.testing.assert_series_equal(
2100+
expected_result, bf_result, check_dtype=False, check_index_type=False
2101+
)
20882102
finally:
20892103
# clean up the gcp assets created for the remote function
20902104
cleanup_remote_function_assets(

0 commit comments

Comments
 (0)