Skip to content

Commit 3b35860

Browse files
authored
feat: enable read_csv() to process other files (#940)
* add tests
* feat: enable read_csv() to process other files
* update to main
* add docs
1 parent: cccc6ca — commit: 3b35860

File tree

4 files changed

+33
-6
lines changed

4 files changed

+33
-6
lines changed

bigframes/session/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1008,10 +1008,12 @@ def _check_file_size(self, filepath: str):
10081008
blob = bucket.blob(blob_name)
10091009
blob.reload()
10101010
file_size = blob.size
1011-
else: # local file path
1011+
elif os.path.exists(filepath): # local file path
10121012
file_size = os.path.getsize(filepath)
1013+
else:
1014+
file_size = None
10131015

1014-
if file_size > max_size:
1016+
if file_size is not None and file_size > max_size:
10151017
# Convert to GB
10161018
file_size = round(file_size / (1024**3), 1)
10171019
max_size = int(max_size / 1024**3)

bigframes/session/loader.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import dataclasses
1919
import datetime
2020
import itertools
21+
import os
2122
import typing
2223
from typing import Dict, Hashable, IO, Iterable, List, Optional, Sequence, Tuple, Union
2324

@@ -421,11 +422,16 @@ def _read_bigquery_load_job(
421422
load_job = self._bqclient.load_table_from_uri(
422423
filepath_or_buffer, table, job_config=job_config
423424
)
424-
else:
425+
elif os.path.exists(filepath_or_buffer): # local file path
425426
with open(filepath_or_buffer, "rb") as source_file:
426427
load_job = self._bqclient.load_table_from_file(
427428
source_file, table, job_config=job_config
428429
)
430+
else:
431+
raise NotImplementedError(
432+
f"BigQuery engine only supports a local file path or GCS path. "
433+
f"{constants.FEEDBACK_LINK}"
434+
)
429435
else:
430436
load_job = self._bqclient.load_table_from_file(
431437
filepath_or_buffer, table, job_config=job_config

tests/system/small/test_session.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1036,6 +1036,25 @@ def test_read_csv_local_w_usecols(session, scalars_pandas_df_index, engine):
10361036
assert len(df.columns) == 1
10371037

10381038

1039+
@pytest.mark.parametrize(
1040+
"engine",
1041+
[
1042+
pytest.param(
1043+
"bigquery",
1044+
id="bq_engine",
1045+
marks=pytest.mark.xfail(
1046+
raises=NotImplementedError,
1047+
),
1048+
),
1049+
pytest.param(None, id="default_engine"),
1050+
],
1051+
)
1052+
def test_read_csv_others(session, engine):
1053+
uri = "https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/main/tests/data/people.csv"
1054+
df = session.read_csv(uri, engine=engine)
1055+
assert len(df.columns) == 3
1056+
1057+
10391058
@pytest.mark.parametrize(
10401059
"engine",
10411060
[

third_party/bigframes_vendored/pandas/io/parsers/readers.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,16 +51,16 @@ def read_csv(
5151
encoding: Optional[str] = None,
5252
**kwargs,
5353
):
54-
"""Loads DataFrame from comma-separated values (csv) file locally or from
55-
Cloud Storage.
54+
"""Loads data from a comma-separated values (csv) file into a DataFrame.
5655
5756
The CSV file data will be persisted as a temporary BigQuery table, which can be
5857
automatically recycled after the Session is closed.
5958
6059
.. note::
6160
using `engine="bigquery"` will not guarantee the same ordering as the
6261
file. Instead, set a serialized index column as the index and sort by
63-
that in the resulting DataFrame.
62+
that in the resulting DataFrame. Only files stored on your local machine
63+
or in Google Cloud Storage are supported.
6464
6565
.. note::
6666
For non-bigquery engine, data is inlined in the query SQL if it is

0 commit comments

Comments (0)