-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
/
Copy pathorc.py
245 lines (206 loc) · 8.19 KB
/
orc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
""" orc compat """
from __future__ import annotations
import io
from types import ModuleType
from typing import (
TYPE_CHECKING,
Any,
Literal,
)
from pandas._config import using_pyarrow_string_dtype
from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
from pandas.util._validators import check_dtype_backend
import pandas as pd
from pandas.core.indexes.api import default_index
from pandas.io._util import arrow_string_types_mapper
from pandas.io.common import (
get_handle,
is_fsspec_url,
)
if TYPE_CHECKING:
import fsspec
import pyarrow.fs
from pandas._typing import (
DtypeBackend,
FilePath,
ReadBuffer,
WriteBuffer,
)
from pandas.core.frame import DataFrame
def read_orc(
path: FilePath | ReadBuffer[bytes],
columns: list[str] | None = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
filesystem: pyarrow.fs.FileSystem | fsspec.spec.AbstractFileSystem | None = None,
**kwargs: Any,
) -> DataFrame:
"""
Load an ORC object from the file path, returning a DataFrame.
Parameters
----------
path : str, path object, or file-like object
String, path object (implementing ``os.PathLike[str]``), or file-like
object implementing a binary ``read()`` function. The string could be a URL.
Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be:
``file://localhost/path/to/table.orc``.
columns : list, default None
If not None, only these columns will be read from the file.
Output always follows the ordering of the file and not the columns list.
This mirrors the original behaviour of
:external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
Back-end data type applied to the resultant :class:`DataFrame`
(still experimental). Behaviour is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
(default).
* ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
DataFrame.
.. versionadded:: 2.0
filesystem : fsspec or pyarrow filesystem, default None
Filesystem object to use when reading the parquet file.
.. versionadded:: 2.1.0
**kwargs
Any additional kwargs are passed to pyarrow.
Returns
-------
DataFrame
Notes
-----
Before using this function you should read the :ref:`user guide about ORC <io.orc>`
and :ref:`install optional dependencies <install.warn_orc>`.
If ``path`` is a URI scheme pointing to a local or remote file (e.g. "s3://"),
a ``pyarrow.fs`` filesystem will be attempted to read the file. You can also pass a
pyarrow or fsspec filesystem object into the filesystem keyword to override this
behavior.
Examples
--------
>>> result = pd.read_orc("example_pa.orc") # doctest: +SKIP
"""
# we require a newer version of pyarrow than we support for parquet
orc = import_optional_dependency("pyarrow.orc")
check_dtype_backend(dtype_backend)
with get_handle(path, "rb", is_text=False) as handles:
source = handles.handle
if is_fsspec_url(path) and filesystem is None:
pa = import_optional_dependency("pyarrow")
pa_fs = import_optional_dependency("pyarrow.fs")
try:
filesystem, source = pa_fs.FileSystem.from_uri(path)
except (TypeError, pa.ArrowInvalid):
pass
pa_table = orc.read_table(
source=source, columns=columns, filesystem=filesystem, **kwargs
)
if dtype_backend is not lib.no_default:
if dtype_backend == "pyarrow":
df = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
else:
from pandas.io._util import _arrow_dtype_mapping
mapping = _arrow_dtype_mapping()
df = pa_table.to_pandas(types_mapper=mapping.get)
return df
else:
if using_pyarrow_string_dtype():
types_mapper = arrow_string_types_mapper()
else:
types_mapper = None
return pa_table.to_pandas(types_mapper=types_mapper)
def to_orc(
df: DataFrame,
path: FilePath | WriteBuffer[bytes] | None = None,
*,
engine: Literal["pyarrow"] = "pyarrow",
index: bool | None = None,
engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
"""
Write a DataFrame to the ORC format.
.. versionadded:: 1.5.0
Parameters
----------
df : DataFrame
The dataframe to be written to ORC. Raises NotImplementedError
if dtype of one or more columns is category, unsigned integers,
intervals, periods or sparse.
path : str, file-like object or None, default None
If a string, it will be used as Root Directory path
when writing a partitioned dataset. By file-like object,
we refer to objects with a write() method, such as a file handle
(e.g. via builtin open function). If path is None,
a bytes object is returned.
engine : str, default 'pyarrow'
ORC library to use.
index : bool, optional
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file.
If ``None``, similar to ``infer`` the dataframe's index(es)
will be saved. However, instead of being saved as values,
the RangeIndex will be stored as a range in the metadata so it
doesn't require much space and is faster. Other indexes will
be included as columns in the file output.
engine_kwargs : dict[str, Any] or None, default None
Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
Returns
-------
bytes if no path argument is provided else None
Raises
------
NotImplementedError
Dtype of one or more columns is category, unsigned integers, interval,
period or sparse.
ValueError
engine is not pyarrow.
Notes
-----
* Before using this function you should read the
:ref:`user guide about ORC <io.orc>` and
:ref:`install optional dependencies <install.warn_orc>`.
* This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
library.
* For supported dtypes please refer to `supported ORC features in Arrow
<https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
* Currently timezones in datetime columns are not preserved when a
dataframe is converted into ORC files.
"""
if index is None:
index = df.index.names[0] is not None
if engine_kwargs is None:
engine_kwargs = {}
# validate index
# --------------
# validate that we have only a default index
# raise on anything else as we don't serialize the index
if not df.index.equals(default_index(len(df))):
raise ValueError(
"orc does not support serializing a non-default index for the index; "
"you can .reset_index() to make the index into column(s)"
)
if df.index.name is not None:
raise ValueError("orc does not serialize index meta-data on a default index")
if engine != "pyarrow":
raise ValueError("engine must be 'pyarrow'")
engine = import_optional_dependency(engine, min_version="10.0.1")
pa = import_optional_dependency("pyarrow")
orc = import_optional_dependency("pyarrow.orc")
was_none = path is None
if was_none:
path = io.BytesIO()
assert path is not None # For mypy
with get_handle(path, "wb", is_text=False) as handles:
assert isinstance(engine, ModuleType) # For mypy
try:
orc.write_table(
engine.Table.from_pandas(df, preserve_index=index),
handles.handle,
**engine_kwargs,
)
except (TypeError, pa.ArrowNotImplementedError) as e:
raise NotImplementedError(
"The dtype of one or more columns is not supported yet."
) from e
if was_none:
assert isinstance(path, io.BytesIO) # For mypy
return path.getvalue()
return None