
Commit 8fab755

chore: sync changes from internal repo (#15)
feat: support `DataFrame.isin` with list and dict inputs
test: move flaky `reset_session` test to unit tests
chore: don't run redundant tests in nightly build
feat: support `DataFrame`-`DataFrame` binary operations
feat: support `Series.map`
feat: support `Index.is_monotonic`
docs: update remote function notebook with read_gbq_function usage
feat: use default session and connection in `ml.llm` and `ml.imported`
chore: disable broken stack tests
feat: support `pow()` and power operator in `DataFrame` and `Series`
feat: support for `np.add`, `np.subtract`, `np.multiply`, `np.divide`, `np.power`
perf: use `row_number()` filter for `head` / `tail`
feat: support `bigframes.pandas.merge()`
fix: make `X_train` argument names consistent across methods
chore: refactor ml core
feat: add `Series.dropna` and `DataFrame.fillna`
chore: fix gcs notebooks upload in 'nightly' build
chore: fix Kokoro build files to support GitHub directories
chore: fix unit test to not require authentication
1 parent cccac8c commit 8fab755
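
The `feat:` entries above add up to a noticeably more pandas-like surface. A rough, non-authoritative sketch of how a few of them combine, assuming a configured session (the penguins table is illustrative, and dict-style `Series.map` support is an assumption based on the pandas API):

    import numpy as np

    import bigframes.pandas as bpd

    # Illustrative public table; any table you can read works as well.
    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

    # pow() and the power operator on Series/DataFrame
    depth_squared = df["culmen_depth_mm"] ** 2

    # NumPy ufunc dispatch: np.add, np.subtract, np.multiply, np.divide, np.power
    combined = np.add(df["culmen_length_mm"], df["culmen_depth_mm"])

    # Series.dropna and Series.map (here with a dict-style mapping)
    species_short = df["species"].dropna().map(
        {"Adelie Penguin (Pygoscelis adeliae)": "adelie"}
    )

    # Module-level merge
    merged = bpd.merge(df, df[["species", "island"]], on="species")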


55 files changed: +2617 -1002 lines

.kokoro/build.sh

Lines changed: 7 additions & 1 deletion
@@ -15,8 +15,14 @@
 set -eo pipefail

+if [[ -z "${KOKORO_GOB_COMMIT}" ]]; then
+    PROJECT_SCM="github"
+else
+    PROJECT_SCM="git"
+fi
+
 if [[ -z "${PROJECT_ROOT:-}" ]]; then
-    PROJECT_ROOT="${KOKORO_ARTIFACTS_DIR}/git/bigframes"
+    PROJECT_ROOT="${KOKORO_ARTIFACTS_DIR}/${PROJECT_SCM}/bigframes"
 fi

 cd "${PROJECT_ROOT}"

.kokoro/continuous/nightly.cfg

Lines changed: 0 additions & 5 deletions
@@ -1,8 +1,3 @@
 # Format: //devtools/kokoro/config/proto/build.proto

-env_vars: {
-    key: "NOX_SESSION"
-    value: "unit system cover lint lint_setup_py mypy format docs e2e notebook"
-}
-
 build_file: "bigframes/.kokoro/release-nightly.sh"

.kokoro/release-nightly.sh

Lines changed: 9 additions & 20 deletions
@@ -34,8 +34,14 @@ while [ $# -gt 0 ] ; do
     shift 1;
 done

+if [[ -z "${KOKORO_GOB_COMMIT}" ]]; then
+    PROJECT_SCM="github"
+else
+    PROJECT_SCM="git"
+fi
+
 if [ -z "${PROJECT_ROOT:-}" ]; then
-    PROJECT_ROOT="${KOKORO_ARTIFACTS_DIR}/git/bigframes"
+    PROJECT_ROOT="${KOKORO_ARTIFACTS_DIR}/${PROJECT_SCM}/bigframes"
 fi

 # Move into the package, build the distribution and upload to shared bucket.
@@ -57,16 +63,6 @@ export PYTHONUNBUFFERED=1
 # Install dependencies, as the following steps depend on it
 python3.10 -m pip install -e .[all]

-# If NOX_SESSION is set, it only runs the specified session,
-# otherwise run all the sessions.
-if ! [ ${DRY_RUN} ]; then
-    if [ -n "${NOX_SESSION:-}" ]; then
-        python3.10 -m nox -s ${NOX_SESSION:-}
-    else
-        python3.10 -m nox
-    fi
-fi
-
 # Generate third party notices and include it in the licenses in setup.cfg
 # TODO(shobs): Don't include it in the package once vertex colab can pick it
 # from elsewhere
@@ -138,15 +134,8 @@ if ! [ ${DRY_RUN} ]; then
     gsutil cp -v dist/* ${gcs_path}
     gsutil cp -v LICENSE ${gcs_path}
     gsutil cp -v ${THIRD_PARTY_NOTICES_FILE} ${gcs_path}
-    gsutil -m cp -v "notebooks/00 - Summary.ipynb" \
-        "notebooks/01 - Getting Started.ipynb" \
-        "notebooks/02 - DataFrame.ipynb" \
-        "notebooks/03 - Using ML - ML fundamentals.ipynb" \
-        "notebooks/04 - Using ML - SKLearn linear regression.ipynb" \
-        "notebooks/05 - Using ML - Easy linear regression.ipynb" \
-        "notebooks/06 - Using ML - Large Language Models.ipynb" \
-        "notebooks/50 - Remote Function.ipynb" \
-        ${gcs_path}notebooks/
+    gsutil -m cp -r -v "notebooks/" ${gcs_path}notebooks/
+
 done

 # publish API coverage information to BigQuery

OWNERS

Lines changed: 1 addition & 0 deletions

bigframes/_config/bigquery_options.py

Lines changed: 10 additions & 12 deletions
@@ -35,13 +35,13 @@ def __init__(
         credentials: Optional[google.auth.credentials.Credentials] = None,
         project: Optional[str] = None,
         location: Optional[str] = None,
-        remote_udf_connection: Optional[str] = None,
+        bq_connection: Optional[str] = None,
         use_regional_endpoints: bool = False,
     ):
         self._credentials = credentials
         self._project = project
         self._location = location
-        self._remote_udf_connection = remote_udf_connection
+        self._bq_connection = bq_connection
         self._use_regional_endpoints = use_regional_endpoints
         self._session_started = False

@@ -82,23 +82,21 @@ def project(self, value: Optional[str]):
         self._project = value

     @property
-    def remote_udf_connection(self) -> Optional[str]:
-        """Name of the BigQuery connection to use for remote functions.
+    def bq_connection(self) -> Optional[str]:
+        """Name of the BigQuery connection to use.

        You should either have the connection already created in the
        <code>location</code> you have chosen, or you should have the Project IAM
        Admin role to enable the service to create the connection for you if you
        need it.
        """
-        return self._remote_udf_connection
+        return self._bq_connection

-    @remote_udf_connection.setter
-    def remote_udf_connection(self, value: Optional[str]):
-        if self._session_started and self._remote_udf_connection != value:
-            raise ValueError(
-                SESSION_STARTED_MESSAGE.format(attribute="remote_udf_connection")
-            )
-        self._remote_udf_connection = value
+    @bq_connection.setter
+    def bq_connection(self, value: Optional[str]):
+        if self._session_started and self._bq_connection != value:
+            raise ValueError(SESSION_STARTED_MESSAGE.format(attribute="bq_connection"))
+        self._bq_connection = value

     @property
     def use_regional_endpoints(self) -> bool:
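
Since `remote_udf_connection` is renamed to `bq_connection` (reflecting that the connection now also backs `ml.llm` and `ml.imported`, not just remote functions), callers setting the option need the new name. A minimal sketch, assuming hypothetical project and connection names:

    import bigframes.pandas as bpd

    # Options must be set before the first query starts the session;
    # once the session has started, the setter raises ValueError.
    bpd.options.bigquery.project = "my-project"              # hypothetical
    bpd.options.bigquery.location = "us"
    bpd.options.bigquery.bq_connection = "bigframes-default"  # hypothetical connection id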

bigframes/clients.py

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""BigQuery DataFrame clients to interact with other cloud resources"""
+
+from __future__ import annotations
+
+import logging
+import time
+from typing import Optional
+
+import google.api_core.exceptions
+import google.api_core.retry  # needed explicitly for the Retry decorator below
+from google.cloud import bigquery_connection_v1, resourcemanager_v3
+from google.iam.v1 import iam_policy_pb2, policy_pb2
+
+logging.basicConfig(
+    level=logging.INFO, format="[%(levelname)s][%(asctime)s][%(name)s] %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+class BqConnectionManager:
+    """Manager to handle operations with BQ connections."""
+
+    # Wait time (in seconds) for an IAM binding to take effect after creation
+    _IAM_WAIT_SECONDS = 120
+
+    def __init__(
+        self,
+        bq_connection_client: bigquery_connection_v1.ConnectionServiceClient,
+        cloud_resource_manager_client: resourcemanager_v3.ProjectsClient,
+    ):
+        self._bq_connection_client = bq_connection_client
+        self._cloud_resource_manager_client = cloud_resource_manager_client
+
+    def create_bq_connection(
+        self, project_id: str, location: str, connection_id: str, iam_role: str
+    ):
+        """Create the BQ connection if it does not exist. In addition, try to
+        add the IAM role to the connection to ensure required permissions.
+
+        Args:
+            project_id:
+                ID of the project.
+            location:
+                Location of the connection.
+            connection_id:
+                ID of the connection.
+            iam_role:
+                The IAM role that the service account of the created connection
+                needs to acquire, e.g. 'run.invoker', 'aiplatform.user'.
+        """
+        # TODO(shobs): The below command to enable BigQuery Connection API needs
+        # to be automated. Disabling for now since most target users would not
+        # have the privilege to enable API in a project.
+        # log("Making sure BigQuery Connection API is enabled")
+        # if os.system("gcloud services enable bigqueryconnection.googleapis.com"):
+        #     raise ValueError("Failed to enable BigQuery Connection API")
+
+        # If the intended connection does not exist then create it
+        service_account_id = self._get_service_account_if_connection_exists(
+            project_id, location, connection_id
+        )
+        if service_account_id:
+            logger.info(
+                f"Connector {project_id}.{location}.{connection_id} already exists"
+            )
+        else:
+            connection_name, service_account_id = self._create_bq_connection(
+                project_id, location, connection_id
+            )
+            logger.info(
+                f"Created BQ connection {connection_name} with service account id: {service_account_id}"
+            )
+
+        # Ensure IAM role on the BQ connection
+        # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function
+        self._ensure_iam_binding(project_id, service_account_id, iam_role)
+
+    # Introduce retries to accommodate transient errors like an etag mismatch,
+    # which can be caused by concurrent operations on the same resource and
+    # manifests with a message like:
+    # google.api_core.exceptions.Aborted: 409 There were concurrent policy
+    # changes. Please retry the whole read-modify-write with exponential
+    # backoff. The request's ETag '\007\006\003,\264\304\337\272' did not match
+    # the current policy's ETag '\007\006\003,\3750&\363'.
+    @google.api_core.retry.Retry(
+        predicate=google.api_core.retry.if_exception_type(
+            google.api_core.exceptions.Aborted
+        ),
+        initial=10,
+        maximum=20,
+        multiplier=2,
+        timeout=60,
+    )
+    def _ensure_iam_binding(
+        self, project_id: str, service_account_id: str, iam_role: str
+    ):
+        """Ensure the necessary IAM role is configured on a service account."""
+        project = f"projects/{project_id}"
+        service_account = f"serviceAccount:{service_account_id}"
+        role = f"roles/{iam_role}"
+        request = iam_policy_pb2.GetIamPolicyRequest(resource=project)
+        policy = self._cloud_resource_manager_client.get_iam_policy(request=request)
+
+        # Check if the binding already exists, and if it does, do nothing more
+        for binding in policy.bindings:
+            if binding.role == role:
+                if service_account in binding.members:
+                    return
+
+        # Create a new binding
+        new_binding = policy_pb2.Binding(role=role, members=[service_account])
+        policy.bindings.append(new_binding)
+        request = iam_policy_pb2.SetIamPolicyRequest(resource=project, policy=policy)
+        self._cloud_resource_manager_client.set_iam_policy(request=request)
+
+        # Wait for the IAM policy change to take effect
+        # https://cloud.google.com/iam/docs/access-change-propagation
+        logger.info(
+            f"Waiting {self._IAM_WAIT_SECONDS} seconds for IAM to take effect.."
+        )
+        time.sleep(self._IAM_WAIT_SECONDS)
+
+    def _create_bq_connection(self, project_id: str, location: str, connection_id: str):
+        """Create the BigQuery Connection and return the corresponding service account id."""
+        client = self._bq_connection_client
+        connection = bigquery_connection_v1.Connection(
+            cloud_resource=bigquery_connection_v1.CloudResourceProperties()
+        )
+        request = bigquery_connection_v1.CreateConnectionRequest(
+            parent=client.common_location_path(project_id, location),
+            connection_id=connection_id,
+            connection=connection,
+        )
+        connection = client.create_connection(request)
+        return connection.name, connection.cloud_resource.service_account_id
+
+    def _get_service_account_if_connection_exists(
+        self, project_id: str, location: str, connection_id: str
+    ) -> Optional[str]:
+        """Return the service account id if the BigQuery Connection exists, else None."""
+        client = self._bq_connection_client
+        request = bigquery_connection_v1.GetConnectionRequest(
+            name=client.connection_path(project_id, location, connection_id)
+        )
+
+        service_account = None
+        try:
+            service_account = client.get_connection(
+                request=request
+            ).cloud_resource.service_account_id
+        except google.api_core.exceptions.NotFound:
+            pass
+
+        return service_account
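
For callers outside the session machinery, a minimal sketch of driving `BqConnectionManager` directly (the project and connection ids here are hypothetical; `run.invoker` is the role the docstring itself gives as an example):

    from google.cloud import bigquery_connection_v1, resourcemanager_v3

    from bigframes.clients import BqConnectionManager

    manager = BqConnectionManager(
        bigquery_connection_v1.ConnectionServiceClient(),
        resourcemanager_v3.ProjectsClient(),
    )

    # Idempotent: reuses the connection if it already exists, creates it
    # otherwise, then ensures the IAM binding (sleeping _IAM_WAIT_SECONDS
    # after a new binding so the policy change can propagate).
    manager.create_bq_connection(
        project_id="my-project",            # hypothetical
        location="us",
        connection_id="bigframes-rf-conn",  # hypothetical
        iam_role="run.invoker",
    )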

bigframes/core/__init__.py

Lines changed: 0 additions & 14 deletions
@@ -1021,20 +1021,6 @@ def slice(
         if not step:
             step = 1

-        # Special cases for head() and tail(), where we don't need to project
-        # offsets. LIMIT clause is much more efficient in BigQuery than a
-        # filter on row_number().
-        if (
-            (start is None or start == 0)
-            and step == 1
-            and stop is not None
-            and stop > 0
-        ):
-            return self.apply_limit(stop)
-
-        if start is not None and start < 0 and step == 1 and stop is None:
-            return self.reversed().apply_limit(abs(start)).reversed()
-
         expr_with_offsets = self.project_offsets()

         # start with True and reduce with start, stop, and step conditions
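
With these special cases removed, `head` and `tail` flow through the generic slice path, which the commit message describes as a `row_number()`-based filter rather than a `LIMIT` clause. User-visible behavior should be unchanged; a quick smoke test, assuming a configured session (the table is illustrative):

    import bigframes.pandas as bpd

    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

    first_five = df.head(5)  # slice(stop=5): now a row_number() filter
    last_five = df.tail(5)   # slice(start=-5): same generic path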
