-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
Allow StringArray[python] to be backed by numpy StringDType in numpy 2.0 #58578
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
56ae252
206d2f0
5adadfa
a1175f2
d6d21c8
7426cd5
2f4ab45
d92f0cb
8e59bba
f2f0798
64f85d3
82922d0
1654f8b
2d736ae
87e2d14
c68ea5a
0f0589e
ca39aaf
88c7d5d
41ab894
43b3ce7
13cf458
ffb5ab7
e6a6d6d
8cf1081
7e5ea63
6a5563f
65abaa6
43a6a2a
23f594b
85609ca
86ffe1c
dc9419d
155ec68
8dadaf9
aad5f32
190ffe3
dcf2cec
b5cdea8
ba0a8b4
10437a0
5691409
4b3e48b
1e1d651
d27816c
19d85bb
11778ed
2034a25
151fe64
8394495
aa7cec9
dfedd1e
d64dcf8
8e32211
187d068
3626c63
908c9e1
70be1f6
ffe133b
b684da0
a202f1b
1a0e783
7e0649f
fd2ba65
f301506
2c46b75
4a538a0
c88884a
d0e3f1e
a175c7a
961a67c
37143be
fbabedc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,17 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in | |
""" | ||
from cpython.unicode cimport PyUnicode_AsUTF8 | ||
|
||
from numpy cimport ( | ||
flatiter, | ||
PyArray_GETITEM, | ||
PyArray_ITER_DATA, | ||
PyArray_ITER_NEXT, | ||
PyArray_IterNew, | ||
) | ||
|
||
|
||
from libc.string cimport strdup | ||
|
||
{{py: | ||
|
||
# name | ||
|
@@ -970,7 +981,12 @@ cdef class StringHashTable(HashTable): | |
kh_resize_str(self.table, size_hint) | ||
|
||
def __dealloc__(self): | ||
cdef: | ||
khiter_t k | ||
if self.table is not NULL: | ||
for k in range(kh_end(self.table)): | ||
if kh_exist(self.table, k): | ||
free(<char*>kh_key(self.table, k)) | ||
kh_destroy_str(self.table) | ||
self.table = NULL | ||
|
||
|
@@ -1013,6 +1029,8 @@ cdef class StringHashTable(HashTable): | |
|
||
v = PyUnicode_AsUTF8(key) | ||
|
||
v = strdup(v) | ||
|
||
k = kh_put_str(self.table, v, &ret) | ||
if kh_exist_str(self.table, k): | ||
self.table.vals[k] = val | ||
|
@@ -1051,7 +1069,7 @@ cdef class StringHashTable(HashTable): | |
return labels | ||
|
||
@cython.boundscheck(False) | ||
def lookup(self, ndarray[object] values, object mask = None) -> ndarray: | ||
def lookup(self, ndarray values, object mask = None) -> ndarray: | ||
# -> np.ndarray[np.intp] | ||
# mask not yet implemented | ||
cdef: | ||
|
@@ -1061,22 +1079,34 @@ cdef class StringHashTable(HashTable): | |
const char *v | ||
khiter_t k | ||
intp_t[::1] locs = np.empty(n, dtype=np.intp) | ||
flatiter it = PyArray_IterNew(values) | ||
|
||
# these by-definition *must* be strings | ||
vecs = <const char **>malloc(n * sizeof(char *)) | ||
if vecs is NULL: | ||
raise MemoryError() | ||
for i in range(n): | ||
val = values[i] | ||
val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) | ||
|
||
if isinstance(val, str): | ||
# GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize | ||
# it as a str, even though isinstance does. | ||
v = PyUnicode_AsUTF8(<str>val) | ||
else: | ||
v = PyUnicode_AsUTF8(self.na_string_sentinel) | ||
|
||
# Need to copy result from PyUnicode_AsUTF8 when we have | ||
# numpy strings | ||
# Since numpy strings aren't backed by object arrays | ||
# the buffer returned by PyUnicode_AsUTF8 will get freed | ||
# in the next iteration when the created str object is GC'ed, | ||
# clobbering the value of v | ||
v = strdup(v) | ||
|
||
vecs[i] = v | ||
|
||
PyArray_ITER_NEXT(it) | ||
|
||
with nogil: | ||
for i in range(n): | ||
v = vecs[i] | ||
|
@@ -1086,11 +1116,16 @@ cdef class StringHashTable(HashTable): | |
else: | ||
locs[i] = -1 | ||
|
||
if values.dtype.kind == "T": | ||
# free copied strings | ||
for i in range(n): | ||
free(vecs[i]) | ||
|
||
free(vecs) | ||
return np.asarray(locs) | ||
|
||
@cython.boundscheck(False) | ||
def map_locations(self, ndarray[object] values, object mask = None) -> None: | ||
def map_locations(self, ndarray values, object mask = None) -> None: | ||
# mask not yet implemented | ||
cdef: | ||
Py_ssize_t i, n = len(values) | ||
|
@@ -1099,32 +1134,45 @@ cdef class StringHashTable(HashTable): | |
const char *v | ||
const char **vecs | ||
khiter_t k | ||
flatiter it = PyArray_IterNew(values) | ||
|
||
# these by-definition *must* be strings | ||
vecs = <const char **>malloc(n * sizeof(char *)) | ||
if vecs is NULL: | ||
raise MemoryError() | ||
for i in range(n): | ||
val = values[i] | ||
val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) | ||
|
||
if isinstance(val, str): | ||
# GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize | ||
# it as a str, even though isinstance does. | ||
v = PyUnicode_AsUTF8(<str>val) | ||
else: | ||
v = PyUnicode_AsUTF8(self.na_string_sentinel) | ||
|
||
# Need to copy result from PyUnicode_AsUTF8 when we have | ||
# numpy strings | ||
# Since numpy strings aren't backed by object arrays | ||
# the buffer returned by PyUnicode_AsUTF8 will get freed | ||
# in the next iteration when the created str object is GC'ed, | ||
# clobbering the value of v | ||
v = strdup(v) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This probably leaks in the current implementation There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I should be freeing these strings in EDIT: Never mind, I'm stupid 😓 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No I wouldn't put it there either - |
||
|
||
vecs[i] = v | ||
|
||
PyArray_ITER_NEXT(it) | ||
|
||
with nogil: | ||
for i in range(n): | ||
v = vecs[i] | ||
k = kh_put_str(self.table, v, &ret) | ||
self.table.vals[k] = i | ||
|
||
free(vecs) | ||
|
||
@cython.boundscheck(False) | ||
@cython.wraparound(False) | ||
def _unique(self, ndarray[object] values, ObjectVector uniques, | ||
def _unique(self, ndarray values, ObjectVector uniques, | ||
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, | ||
object na_value=None, bint ignore_na=False, | ||
bint return_inverse=False): | ||
|
@@ -1171,11 +1219,13 @@ cdef class StringHashTable(HashTable): | |
const char **vecs | ||
khiter_t k | ||
bint use_na_value | ||
flatiter it = PyArray_IterNew(values) | ||
bint non_null_na_value | ||
|
||
if return_inverse: | ||
labels = np.zeros(n, dtype=np.intp) | ||
uindexer = np.empty(n, dtype=np.int64) | ||
|
||
use_na_value = na_value is not None | ||
non_null_na_value = not checknull(na_value) | ||
|
||
|
@@ -1184,7 +1234,7 @@ cdef class StringHashTable(HashTable): | |
if vecs is NULL: | ||
raise MemoryError() | ||
for i in range(n): | ||
val = values[i] | ||
val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) | ||
|
||
if (ignore_na | ||
and (not isinstance(val, str) | ||
|
@@ -1202,10 +1252,22 @@ cdef class StringHashTable(HashTable): | |
# if ignore_na is False, we also stringify NaN/None/etc. | ||
try: | ||
v = PyUnicode_AsUTF8(<str>val) | ||
except UnicodeEncodeError: | ||
except (UnicodeEncodeError,TypeError): | ||
# pd.NA will raise TypeError | ||
v = PyUnicode_AsUTF8(<str>repr(val)) | ||
|
||
# Need to copy result from PyUnicode_AsUTF8 when we have | ||
# numpy strings | ||
# Since numpy strings aren't backed by object arrays | ||
# the buffer returned by PyUnicode_AsUTF8 will get freed | ||
# in the next iteration when the created str object is GC'ed, | ||
# clobbering the value of v | ||
v = strdup(v) | ||
|
||
vecs[i] = v | ||
|
||
PyArray_ITER_NEXT(it) | ||
|
||
# compute | ||
with nogil: | ||
for i in range(n): | ||
|
@@ -1239,7 +1301,7 @@ cdef class StringHashTable(HashTable): | |
return uniques.to_array(), labels.base # .base -> underlying ndarray | ||
return uniques.to_array() | ||
|
||
def unique(self, ndarray[object] values, *, bint return_inverse=False, object mask=None): | ||
def unique(self, ndarray values, *, bint return_inverse=False, object mask=None): | ||
""" | ||
Calculate unique values and labels (no sorting!) | ||
|
||
|
@@ -1264,7 +1326,7 @@ cdef class StringHashTable(HashTable): | |
return self._unique(values, uniques, ignore_na=False, | ||
return_inverse=return_inverse) | ||
|
||
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, | ||
def factorize(self, ndarray values, Py_ssize_t na_sentinel=-1, | ||
object na_value=None, object mask=None, ignore_na=True): | ||
""" | ||
Calculate unique values and labels (no sorting!) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,5 +33,6 @@ | |
"PeriodArray", | ||
"SparseArray", | ||
"StringArray", | ||
"ObjectStringArray", | ||
"TimedeltaArray", | ||
] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm a bit wary of managing lifecycle this way - so the existing implementation has no ownership of the string lifecycle then, right? It's probably easier to make that a StringView hash table and then create a dedicated String hash table which does copy
This is another case where using C++ would be a better language choice than tempita (see also https://github.com/pandas-dev/pandas/pull/57730/files)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, editing anything in tempita kinda sucks in general.
But yes, I think the existing implementation doesn't have ownership of the Python string objects.
Turning this into StringViewHashTable, and subclassing this sounds good to me.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You could get the UTF-8 string data from the array entry directly, without going through
PyArray_GETITEM,
via the NumPy C API: https://numpy.org/neps/nep-0055-string_dtype.html#packing-and-loading-strings
There aren't cython bindings for this API yet in the numpy cython bindings but it's on my list of things to do. It probably makes sense to manage the allocators with a context manager, for example.
I also see that the new C API isn't yet covered in the C API docs and I need to make sure there are docs for the stringdtype C API before the 2.0 release happens. Thank you for prompting me to notice that oversight!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
numpy/numpy#26418