21
21
import logging
22
22
import os
23
23
import re
24
+ import secrets
24
25
import typing
25
26
from typing import (
26
27
Any ,
37
38
Tuple ,
38
39
Union ,
39
40
)
41
+ import uuid
40
42
import warnings
41
43
42
44
# Even though the ibis.backends.bigquery import is unused, it's needed
100
102
101
103
# Connection ID used when the caller does not supply an explicit
# BigQuery connection name.
_BIGFRAMES_DEFAULT_CONNECTION_ID = "bigframes-default-connection"

# Template for session-scoped temporary table IDs: creation date, the
# owning session's ID, and a random suffix to avoid collisions. Session
# cleanup relies on these IDs being recorded in ``_table_ids``.
_TEMP_TABLE_ID_FORMAT = "bqdf{date}_{session_id}_{random_id}"

# NOTE(review): presumably mirrors BigQuery's limit on clustering
# columns — confirm against current BigQuery documentation.
_MAX_CLUSTER_COLUMNS = 4
# TODO(swast): Need to connect to regional endpoints when performing remote
@@ -203,7 +207,11 @@ def __init__(
203
207
bq_kms_key_name = self ._bq_kms_key_name ,
204
208
)
205
209
206
- self ._create_bq_datasets ()
210
+ self ._anonymous_dataset = (
211
+ bigframes .session ._io .bigquery .create_bq_dataset_reference (
212
+ self .bqclient , location = self ._location
213
+ )
214
+ )
207
215
208
216
# TODO(shobs): Remove this logic after https://github.com/ibis-project/ibis/issues/8494
209
217
# has been fixed. The ibis client changes the default query job config
@@ -233,6 +241,13 @@ def __init__(
233
241
bigquery .TableReference , Tuple [datetime .datetime , bigquery .Table ]
234
242
] = {}
235
243
244
+ # unique session identifier, short enough to be human readable
245
+ # only needs to be unique among sessions created by the same user
246
+ # at the same time in the same region
247
+ self ._session_id : str = "session" + secrets .token_hex (3 )
248
+ self ._table_ids : List [str ] = []
249
+ # store table ids and delete them when the session is closed
250
+
236
251
@property
237
252
def bqclient (self ):
238
253
return self ._clients_provider .bqclient
@@ -263,6 +278,10 @@ def bqconnectionmanager(self):
263
278
)
264
279
return self ._bq_connection_manager
265
280
281
@property
def session_id(self):
    """Short, human-readable identifier of this session.

    Embedded in the IDs of temp tables created by this session (see
    ``_random_table``) so they can be found and deleted at close.
    """
    return self._session_id
266
285
@property
267
286
def _project (self ):
268
287
return self .bqclient .project
@@ -271,24 +290,15 @@ def __hash__(self):
271
290
# Stable hash needed to use in expression tree
272
291
return hash (str (self ._anonymous_dataset ))
273
292
274
- def _create_bq_datasets (self ):
275
- """Create and identify dataset(s) for temporary BQ resources."""
276
- query_job = self .bqclient .query ("SELECT 1" , location = self ._location )
277
- query_job .result () # blocks until finished
278
-
279
- # The anonymous dataset is used by BigQuery to write query results and
280
- # session tables. BigQuery DataFrames also writes temp tables directly
281
- # to the dataset, no BigQuery Session required. Note: there is a
282
- # different anonymous dataset per location. See:
283
- # https://cloud.google.com/bigquery/docs/cached-results#how_cached_results_are_stored
284
- query_destination = query_job .destination
285
- self ._anonymous_dataset = bigquery .DatasetReference (
286
- query_destination .project ,
287
- query_destination .dataset_id ,
288
- )
289
-
290
293
def close(self):
    """Delete tables that were created with this session's session_id."""
    # Temp tables live in this session's anonymous dataset. Deletion is
    # best-effort: tables that already expired or were removed are
    # silently skipped via not_found_ok.
    dataset = self._anonymous_dataset
    for tid in self._table_ids:
        self.bqclient.delete_table(
            f"{dataset.project}.{dataset.dataset_id}.{tid}",
            not_found_ok=True,
        )
292
302
293
303
def read_gbq (
294
304
self ,
@@ -1063,7 +1073,7 @@ def _read_pandas_load_job(
1063
1073
1064
1074
job_config .labels = {"bigframes-api" : api_name }
1065
1075
1066
- load_table_destination = bigframes_io . random_table ( self ._anonymous_dataset )
1076
+ load_table_destination = self ._random_table ( )
1067
1077
load_job = self .bqclient .load_table_from_dataframe (
1068
1078
pandas_dataframe_copy ,
1069
1079
load_table_destination ,
@@ -1145,7 +1155,7 @@ def read_csv(
1145
1155
encoding : Optional [str ] = None ,
1146
1156
** kwargs ,
1147
1157
) -> dataframe .DataFrame :
1148
- table = bigframes_io . random_table ( self ._anonymous_dataset )
1158
+ table = self ._random_table ( )
1149
1159
1150
1160
if engine is not None and engine == "bigquery" :
1151
1161
if any (param is not None for param in (dtype , names )):
@@ -1282,7 +1292,7 @@ def read_parquet(
1282
1292
* ,
1283
1293
engine : str = "auto" ,
1284
1294
) -> dataframe .DataFrame :
1285
- table = bigframes_io . random_table ( self ._anonymous_dataset )
1295
+ table = self ._random_table ( )
1286
1296
1287
1297
if engine == "bigquery" :
1288
1298
job_config = self ._prepare_load_job_config ()
@@ -1319,7 +1329,7 @@ def read_json(
1319
1329
engine : Literal ["ujson" , "pyarrow" , "bigquery" ] = "ujson" ,
1320
1330
** kwargs ,
1321
1331
) -> dataframe .DataFrame :
1322
- table = bigframes_io . random_table ( self ._anonymous_dataset )
1332
+ table = self ._random_table ( )
1323
1333
1324
1334
if engine == "bigquery" :
1325
1335
@@ -1416,14 +1426,12 @@ def _create_empty_temp_table(
1416
1426
) -> bigquery .TableReference :
1417
1427
# Can't set a table in _SESSION as destination via query job API, so we
1418
1428
# run DDL, instead.
1419
- dataset = self ._anonymous_dataset
1420
1429
expiration = (
1421
1430
datetime .datetime .now (datetime .timezone .utc ) + constants .DEFAULT_EXPIRATION
1422
1431
)
1423
1432
1424
1433
table = bigframes_io .create_temp_table (
1425
- self .bqclient ,
1426
- dataset ,
1434
+ self ,
1427
1435
expiration ,
1428
1436
schema = schema ,
1429
1437
cluster_columns = cluster_cols ,
@@ -1939,6 +1947,32 @@ def _start_generic_job(self, job: formatting_helpers.GenericJob):
1939
1947
else :
1940
1948
job .result ()
1941
1949
1950
def _random_table(self, skip_cleanup: bool = False) -> bigquery.TableReference:
    """Generate a random table ID with BigQuery DataFrames prefix.

    The generated ID is recorded and checked for deletion when the
    session is closed, unless skip_cleanup is True.

    Args:
        skip_cleanup (bool, default False):
            If True, do not add the generated ID to the list of tables
            to clean up when the session is closed.

    Returns:
        google.cloud.bigquery.TableReference:
            Fully qualified table ID of a table that doesn't exist.
    """
    # Date + session ID make the name human-traceable; the uuid4 suffix
    # makes collisions effectively impossible.
    today = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d")
    new_table_id = _TEMP_TABLE_ID_FORMAT.format(
        date=today,
        session_id=self.session_id,
        random_id=uuid.uuid4().hex,
    )
    if not skip_cleanup:
        self._table_ids.append(new_table_id)
    return self._anonymous_dataset.table(new_table_id)
1975
+
1942
1976
1943
1977
def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session:
    """Create a new Session, configured by ``context`` when given."""
    session = Session(context)
    return session
0 commit comments